bookdata/goodreads/
review.rs1use chrono::NaiveDateTime;
3use parquet_derive::ParquetRecordWriter;
4pub use serde::Deserialize;
5
6use crate::arrow::*;
7use crate::ids::index::IdIndex;
8use crate::parsing::dates::*;
9use crate::parsing::*;
10use crate::prelude::*;
11
12use super::ids::load_id_links;
13use super::ids::BookLinkMap;
14use super::users::load_user_index;
15
/// Name of the Parquet file the review table is written to.
const OUT_FILE: &str = "gr-reviews.parquet";
17
18#[derive(Deserialize)]
20pub struct RawReview {
21 pub user_id: String,
22 pub book_id: String,
23 pub review_id: String,
24 pub rating: f32,
25 pub review_text: String,
26 pub n_votes: i32,
27 pub date_added: String,
28 pub date_updated: String,
29 #[allow(unused)]
30 pub read_at: String,
31 #[allow(unused)]
32 pub started_at: String,
33}
34
35#[derive(ParquetRecordWriter)]
37pub struct ReviewRecord {
38 pub rec_id: u32,
40 pub review_id: i64,
42 pub user_id: i32,
44 pub book_id: i32,
46 pub work_id: Option<i32>,
48 pub cluster: i32,
52 pub item_id: i32,
54 pub rating: Option<f32>,
56 pub review: String,
58 pub n_votes: i32,
60 pub added: Option<NaiveDateTime>,
62 pub updated: Option<NaiveDateTime>,
64}
65
66pub struct ReviewWriter {
68 writer: TableWriter<ReviewRecord>,
69 users: IdIndex<String>,
70 books: BookLinkMap,
71 n_recs: u32,
72}
73
74impl ReviewWriter {
75 pub fn open() -> Result<ReviewWriter> {
77 let writer = TableWriter::open(OUT_FILE)?;
78 let users = load_user_index()?.freeze();
79 let books = load_id_links()?;
80 Ok(ReviewWriter {
81 writer,
82 users,
83 books,
84 n_recs: 0,
85 })
86 }
87}
88
89impl ObjectWriter<RawReview> for ReviewWriter {
90 fn write_object(&mut self, row: RawReview) -> Result<()> {
92 self.n_recs += 1;
93 let rec_id = self.n_recs;
94 let user_id = self.users.intern_owned(row.user_id)?;
95 let book_id: i32 = row.book_id.parse()?;
96 let (rev_hi, rev_lo) = decode_hex_i64_pair(&row.review_id)?;
97 let review_id = rev_hi ^ rev_lo;
99 let link = self
100 .books
101 .get(&book_id)
102 .ok_or_else(|| anyhow!("unknown book ID"))?;
103
104 self.writer.write_object(ReviewRecord {
105 rec_id,
106 review_id,
107 user_id,
108 book_id,
109 work_id: link.work_id,
110 item_id: link.item_id(),
111 cluster: link.cluster,
112 review: row.review_text,
113 rating: if row.rating > 0.0 {
114 Some(row.rating)
115 } else {
116 None
117 },
118 n_votes: row.n_votes,
119 added: parse_gr_date(&row.date_added).map(check_ts("added", 2000))?,
120 updated: parse_gr_date(&row.date_updated).map(check_ts("updated", 2000))?,
121 })?;
122
123 Ok(())
124 }
125
126 fn finish_objects(self) -> Result<usize> {
128 info!(
129 "wrote {} records for {} users, closing output",
130 self.n_recs,
131 self.users.len()
132 );
133 self.writer.finish_objects()
134 }
135}