bookdata/goodreads/
book.rs

1//! GoodReads book schemas and record processing.
2use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::*;
7use crate::ids::codes::NS_GR_BOOK;
8use crate::ids::codes::NS_GR_WORK;
9use crate::parsing::*;
10use crate::prelude::*;
11
/// Output file for book identifier records (`BookIdRecord`).
const ID_FILE: &str = "gr-book-ids.parquet";
/// Output file for book information records (`BookRecord`).
const INFO_FILE: &str = "gr-book-info.parquet";
/// Output file for book-to-series link records (`BookSeriesRecord`).
const SERIES_FILE: &str = "gr-book-series.parquet";
/// Output file for book-to-author link records (`BookAuthorRecord`).
const AUTHOR_FILE: &str = "gr-book-authors.parquet";
16
17/// The raw records we read from JSON
18#[allow(dead_code)]
19#[derive(Deserialize)]
20pub struct RawBook {
21    pub book_id: String,
22    pub work_id: String,
23    pub isbn: String,
24    pub isbn13: String,
25    pub asin: String,
26    #[serde(default)]
27    pub title: String,
28    #[serde(default)]
29    pub authors: Vec<RawAuthor>,
30    #[serde(default)]
31    pub publication_year: String,
32    #[serde(default)]
33    pub publication_month: String,
34    #[serde(default)]
35    pub publication_day: String,
36    #[serde(default)]
37    pub series: Vec<String>,
38}
39
40/// The raw author records from JSON.
41#[derive(Deserialize)]
42pub struct RawAuthor {
43    pub author_id: String,
44    #[serde(default)]
45    pub role: String,
46}
47
48/// the book ID records to write to Parquet.
49#[derive(ParquetRecordWriter)]
50pub struct BookIdRecord {
51    /// The book ID, converted from UUID.
52    pub book_id: i32,
53    /// The work ID, converted from UUID.
54    pub work_id: Option<i32>,
55    /// The integrated item ID, converted from book and work IDs projected into number spaces.
56    pub item_id: i32,
57    pub isbn10: Option<String>,
58    pub isbn13: Option<String>,
59    pub asin: Option<String>,
60}
61
62/// book info records to actually write
63#[derive(ParquetRecordWriter)]
64pub struct BookRecord {
65    pub book_id: i32,
66    pub title: Option<String>,
67    pub pub_year: Option<u16>,
68    pub pub_month: Option<u8>,
69}
70
71/// book series linking records
72#[derive(ParquetRecordWriter)]
73pub struct BookSeriesRecord {
74    pub book_id: i32,
75    pub series: String,
76}
77
78/// book author linking records
79#[derive(ParquetRecordWriter)]
80pub struct BookAuthorRecord {
81    pub book_id: i32,
82    pub author_id: i32,
83    pub position: i16,
84    pub role: Option<String>,
85}
86
87/// Output handler for GoodReads books.
88pub struct BookWriter {
89    id_out: TableWriter<BookIdRecord>,
90    info_out: TableWriter<BookRecord>,
91    author_out: TableWriter<BookAuthorRecord>,
92    series_out: TableWriter<BookSeriesRecord>,
93}
94
95impl BookWriter {
96    pub fn open() -> Result<BookWriter> {
97        let id_out = TableWriter::open(ID_FILE)?;
98        let info_out = TableWriter::open(INFO_FILE)?;
99        let author_out = TableWriter::open(AUTHOR_FILE)?;
100        let series_out = TableWriter::open(SERIES_FILE)?;
101        Ok(BookWriter {
102            id_out,
103            info_out,
104            author_out,
105            series_out,
106        })
107    }
108}
109
110impl DataSink for BookWriter {
111    fn output_files<'a>(&'a self) -> Vec<PathBuf> {
112        path_list(&[ID_FILE, INFO_FILE, AUTHOR_FILE, SERIES_FILE])
113    }
114}
115
116impl ObjectWriter<RawBook> for BookWriter {
117    fn write_object(&mut self, row: RawBook) -> Result<()> {
118        let book_id = row.book_id.parse()?;
119        let work_id = parse_opt(&row.work_id)?;
120        let item_id = if let Some(w) = work_id {
121            NS_GR_WORK.to_code(w)
122        } else {
123            NS_GR_BOOK.to_code(book_id)
124        };
125
126        self.id_out.write_object(BookIdRecord {
127            book_id,
128            work_id,
129            item_id,
130            isbn10: trim_opt(&row.isbn)
131                .map(|s| clean_asin_chars(s))
132                .filter(|s| s.len() >= 7),
133            isbn13: trim_opt(&row.isbn13)
134                .map(|s| clean_asin_chars(s))
135                .filter(|s| s.len() >= 7),
136            asin: trim_opt(&row.asin)
137                .map(|s| clean_asin_chars(s))
138                .filter(|s| s.len() >= 7),
139        })?;
140
141        let pub_year = parse_opt(&row.publication_year)?;
142        let pub_month = parse_opt(&row.publication_month)?;
143
144        self.info_out.write_object(BookRecord {
145            book_id,
146            title: trim_owned(&row.title),
147            pub_year,
148            pub_month,
149        })?;
150
151        for (i, author) in row.authors.into_iter().enumerate() {
152            self.author_out.write_object(BookAuthorRecord {
153                book_id,
154                author_id: author.author_id.parse()?,
155                position: (i as i16) + 1,
156                role: Some(author.role).filter(|s| !s.is_empty()),
157            })?;
158        }
159
160        for series in row.series {
161            self.series_out
162                .write_object(BookSeriesRecord { book_id, series })?;
163        }
164
165        Ok(())
166    }
167
168    fn finish(self) -> Result<usize> {
169        self.id_out.finish()?;
170        self.info_out.finish()?;
171        self.author_out.finish()?;
172        self.series_out.finish()?;
173        Ok(0)
174    }
175}