bookdata/goodreads/
work.rs

1//! GoodReads work schemas and record processing.
2use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::parsing::*;
7use crate::prelude::*;
8
/// File name handed to `TableWriter::open` for the processed work table.
const OUT_FILE: &str = "gr-work-info.parquet";
10
11/// Work records as parsed from JSON.
12#[derive(Deserialize)]
13pub struct RawWork {
14    pub work_id: String,
15    #[serde(default)]
16    pub original_title: String,
17    #[serde(default)]
18    pub original_publication_year: String,
19    #[serde(default)]
20    pub original_publication_month: String,
21    #[serde(default)]
22    #[allow(unused)]
23    pub original_publication_day: String,
24}
25
26/// Rows in the processed work Parquet table.
27#[derive(ParquetRecordWriter)]
28pub struct WorkRecord {
29    pub work_id: i32,
30    pub title: Option<String>,
31    pub pub_year: Option<i16>,
32    pub pub_month: Option<u8>,
33}
34
35/// Object writer to transform and write GoodReads works
36pub struct WorkWriter {
37    writer: TableWriter<WorkRecord>,
38    n_recs: usize,
39}
40
41impl WorkWriter {
42    /// Open a new output
43    pub fn open() -> Result<WorkWriter> {
44        let writer = TableWriter::open(OUT_FILE)?;
45        Ok(WorkWriter { writer, n_recs: 0 })
46    }
47}
48
49impl ObjectWriter<RawWork> for WorkWriter {
50    fn write_object(&mut self, row: RawWork) -> Result<()> {
51        let work_id: i32 = row.work_id.parse()?;
52
53        let pub_year = parse_opt(&row.original_publication_year)?;
54        let pub_month = parse_opt(&row.original_publication_month)?;
55
56        self.writer.write_object(WorkRecord {
57            work_id,
58            title: trim_owned(&row.original_title),
59            pub_year,
60            pub_month,
61        })?;
62        self.n_recs += 1;
63        Ok(())
64    }
65
66    fn finish_objects(self) -> Result<usize> {
67        self.writer.finish_objects()?;
68        Ok(self.n_recs)
69    }
70}