bookdata/goodreads/
review.rs

1//! GoodReads review data model.
2use chrono::NaiveDateTime;
3use parquet_derive::ParquetRecordWriter;
4pub use serde::Deserialize;
5
6use crate::arrow::*;
7use crate::ids::index::IdIndex;
8use crate::parsing::dates::*;
9use crate::parsing::*;
10use crate::prelude::*;
11
12use super::ids::load_id_links;
13use super::ids::BookLinkMap;
14use super::users::load_user_index;
15
/// Name of the Parquet file the review table is written to.
const OUT_FILE: &str = "gr-reviews.parquet";
17
/// Review records we read from JSON.
///
/// Identifiers and dates are kept as the raw strings found in the input;
/// [`ReviewWriter`] parses them when it builds a [`ReviewRecord`].
#[derive(Deserialize)]
pub struct RawReview {
    /// User identifier string; the writer interns it to a numeric user ID.
    pub user_id: String,
    /// Book identifier as text; the writer parses it as an `i32`.
    pub book_id: String,
    /// Review identifier as text; the writer decodes it into a pair of
    /// integers (presumably from hexadecimal, going by the decoder's name).
    pub review_id: String,
    /// Rating value; the writer stores values that are not greater than zero
    /// as a missing rating.
    pub rating: f32,
    /// Text of the review, copied to the output unchanged.
    pub review_text: String,
    /// Number of votes the review has received.
    pub n_votes: i32,
    /// Date the review was added, as an unparsed string.
    pub date_added: String,
    /// Date the review was last updated, as an unparsed string.
    pub date_updated: String,
    /// Deserialized from the input but has no counterpart in [`ReviewRecord`],
    /// hence the `allow(unused)`.
    #[allow(unused)]
    pub read_at: String,
    /// Deserialized from the input but has no counterpart in [`ReviewRecord`],
    /// hence the `allow(unused)`.
    #[allow(unused)]
    pub started_at: String,
}
34
/// Review records to write to the Parquet table.
///
/// The field order here is the order the fields are handed to the
/// `ParquetRecordWriter` derive, so it should be treated as part of the
/// output schema.
#[derive(ParquetRecordWriter)]
pub struct ReviewRecord {
    /// Internal auto-generated record identifier (taken from the writer's
    /// running record counter).
    pub rec_id: u32,
    /// Review identifier (derived from input): the writer XORs the two
    /// integers decoded from the input review ID string.
    pub review_id: i64,
    /// User identifier.
    pub user_id: i32,
    /// GoodReads book identifier.
    pub book_id: i32,
    /// GoodReads work identifier.
    pub work_id: Option<i32>,
    /// Cluster identifier (from [integration clustering][clust]).
    ///
    /// [clust]: https://bookdata.piret.info/data/cluster.html
    pub cluster: i32,
    /// GoodReads “item” identifier
    pub item_id: i32,
    /// Rating associated with this review (if provided).
    pub rating: Option<f32>,
    /// Review text.
    pub review: String,
    /// Number of votes this review has received.
    pub n_votes: i32,
    /// Date review was added.
    pub added: Option<NaiveDateTime>,
    /// Date review was updated.
    pub updated: Option<NaiveDateTime>,
}
65
/// Object writer to transform and write GoodReads reviews.
///
/// Consumes [`RawReview`] values and emits [`ReviewRecord`] rows.
pub struct ReviewWriter {
    /// Output table the transformed records are written to.
    writer: TableWriter<ReviewRecord>,
    /// Index used to map user ID strings to numeric user IDs.
    users: IdIndex<String>,
    /// Lookup from book ID to its link data (work, item, and cluster IDs).
    books: BookLinkMap,
    /// Running count of records processed; also the source of `rec_id`.
    n_recs: u32,
}
73
74impl ReviewWriter {
75    // Open a new output
76    pub fn open() -> Result<ReviewWriter> {
77        let writer = TableWriter::open(OUT_FILE)?;
78        let users = load_user_index()?.freeze();
79        let books = load_id_links()?;
80        Ok(ReviewWriter {
81            writer,
82            users,
83            books,
84            n_recs: 0,
85        })
86    }
87}
88
89impl ObjectWriter<RawReview> for ReviewWriter {
90    // Write a single interaction to the output
91    fn write_object(&mut self, row: RawReview) -> Result<()> {
92        self.n_recs += 1;
93        let rec_id = self.n_recs;
94        let user_id = self.users.intern_owned(row.user_id)?;
95        let book_id: i32 = row.book_id.parse()?;
96        let (rev_hi, rev_lo) = decode_hex_i64_pair(&row.review_id)?;
97        // review ids were checked for dupluicates in interaction scan, don't repeat here
98        let review_id = rev_hi ^ rev_lo;
99        let link = self
100            .books
101            .get(&book_id)
102            .ok_or_else(|| anyhow!("unknown book ID"))?;
103
104        self.writer.write_object(ReviewRecord {
105            rec_id,
106            review_id,
107            user_id,
108            book_id,
109            work_id: link.work_id,
110            item_id: link.item_id(),
111            cluster: link.cluster,
112            review: row.review_text,
113            rating: if row.rating > 0.0 {
114                Some(row.rating)
115            } else {
116                None
117            },
118            n_votes: row.n_votes,
119            added: parse_gr_date(&row.date_added).map(check_ts("added", 2000))?,
120            updated: parse_gr_date(&row.date_updated).map(check_ts("updated", 2000))?,
121        })?;
122
123        Ok(())
124    }
125
126    // Clean up and finalize output
127    fn finish_objects(self) -> Result<usize> {
128        info!(
129            "wrote {} records for {} users, closing output",
130            self.n_recs,
131            self.users.len()
132        );
133        self.writer.finish_objects()
134    }
135}