bookdata/goodreads/
book.rs

1//! GoodReads book schemas and record processing.
2use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::*;
7use crate::ids::codes::NS_GR_BOOK;
8use crate::ids::codes::NS_GR_WORK;
9use crate::parsing::*;
10use crate::prelude::*;
11
/// Output file for the book identifier table.
const ID_FILE: &str = "gr-book-ids.parquet";
/// Output file for the book information table (title and publication date).
const INFO_FILE: &str = "gr-book-info.parquet";
/// Output file for the book-to-series link table.
const SERIES_FILE: &str = "gr-book-series.parquet";
/// Output file for the book-to-author link table.
const AUTHOR_FILE: &str = "gr-book-authors.parquet";
16
17/// The raw records we read from JSON
18#[allow(dead_code)]
19#[derive(Deserialize)]
20pub struct RawBook {
21    pub book_id: String,
22    pub work_id: String,
23    pub isbn: String,
24    pub isbn13: String,
25    pub asin: String,
26    #[serde(default)]
27    pub title: String,
28    #[serde(default)]
29    pub authors: Vec<RawAuthor>,
30    #[serde(default)]
31    pub publication_year: String,
32    #[serde(default)]
33    pub publication_month: String,
34    #[serde(default)]
35    pub publication_day: String,
36    #[serde(default)]
37    pub series: Vec<String>,
38}
39
40/// The raw author records from JSON.
41#[derive(Deserialize)]
42pub struct RawAuthor {
43    pub author_id: String,
44    #[serde(default)]
45    pub role: String,
46}
47
48/// the book ID records to write to Parquet.
49#[derive(ParquetRecordWriter)]
50pub struct BookIdRecord {
51    /// The book ID, converted from UUID.
52    pub book_id: i32,
53    /// The work ID, converted from UUID.
54    pub work_id: Option<i32>,
55    /// The integrated item ID, converted from book and work IDs projected into number spaces.
56    pub item_id: i32,
57    pub isbn10: Option<String>,
58    pub isbn13: Option<String>,
59    pub asin: Option<String>,
60}
61
62/// book info records to actually write
63#[derive(ParquetRecordWriter)]
64pub struct BookRecord {
65    pub book_id: i32,
66    pub title: Option<String>,
67    pub pub_year: Option<u16>,
68    pub pub_month: Option<u8>,
69}
70
71/// book series linking records
72#[derive(ParquetRecordWriter)]
73pub struct BookSeriesRecord {
74    pub book_id: i32,
75    pub series: String,
76}
77
78/// book author linking records
79#[derive(ParquetRecordWriter)]
80pub struct BookAuthorRecord {
81    pub book_id: i32,
82    pub author_id: i32,
83    pub position: i16,
84    pub role: Option<String>,
85}
86
87/// Output handler for GoodReads books.
88pub struct BookWriter {
89    id_out: TableWriter<BookIdRecord>,
90    info_out: TableWriter<BookRecord>,
91    author_out: TableWriter<BookAuthorRecord>,
92    series_out: TableWriter<BookSeriesRecord>,
93}
94
95impl BookWriter {
96    pub fn open() -> Result<BookWriter> {
97        let id_out = TableWriter::open(ID_FILE)?;
98        let info_out = TableWriter::open(INFO_FILE)?;
99        let author_out = TableWriter::open(AUTHOR_FILE)?;
100        let series_out = TableWriter::open(SERIES_FILE)?;
101        Ok(BookWriter {
102            id_out,
103            info_out,
104            author_out,
105            series_out,
106        })
107    }
108}
109
110impl ObjectWriter<RawBook> for BookWriter {
111    fn write_object(&mut self, row: RawBook) -> Result<()> {
112        let book_id = row.book_id.parse()?;
113        let work_id = parse_opt(&row.work_id)?;
114        let item_id = if let Some(w) = work_id {
115            NS_GR_WORK.to_code(w)
116        } else {
117            NS_GR_BOOK.to_code(book_id)
118        };
119
120        self.id_out.write_object(BookIdRecord {
121            book_id,
122            work_id,
123            item_id,
124            isbn10: trim_opt(&row.isbn)
125                .map(|s| clean_asin_chars(s))
126                .filter(|s| s.len() >= 7),
127            isbn13: trim_opt(&row.isbn13)
128                .map(|s| clean_asin_chars(s))
129                .filter(|s| s.len() >= 7),
130            asin: trim_opt(&row.asin)
131                .map(|s| clean_asin_chars(s))
132                .filter(|s| s.len() >= 7),
133        })?;
134
135        let pub_year = parse_opt(&row.publication_year)?;
136        let pub_month = parse_opt(&row.publication_month)?;
137
138        self.info_out.write_object(BookRecord {
139            book_id,
140            title: trim_owned(&row.title),
141            pub_year,
142            pub_month,
143        })?;
144
145        for (i, author) in row.authors.into_iter().enumerate() {
146            self.author_out.write_object(BookAuthorRecord {
147                book_id,
148                author_id: author.author_id.parse()?,
149                position: (i as i16) + 1,
150                role: Some(author.role).filter(|s| !s.is_empty()),
151            })?;
152        }
153
154        for series in row.series {
155            self.series_out
156                .write_object(BookSeriesRecord { book_id, series })?;
157        }
158
159        Ok(())
160    }
161
162    fn finish_objects(self) -> Result<usize> {
163        self.id_out.finish_objects()?;
164        self.info_out.finish_objects()?;
165        self.author_out.finish_objects()?;
166        self.series_out.finish_objects()?;
167        Ok(0)
168    }
169}