bookdata/goodreads/
genres.rs

1//! GoodReads book genre records.
2use std::collections::HashMap;
3
4use parquet_derive::ParquetRecordWriter;
5use serde::Deserialize;
6
7use crate::arrow::*;
8use crate::ids::index::IdIndex;
9use crate::prelude::*;
10
/// File name handed to `TableWriter::open` for the book-genre table.
const OUT_FILE: &str = "gr-book-genres.parquet";
/// File name the genre index is saved under when writing finishes.
const GENRE_FILE: &str = "gr-genres.parquet";
13
14/// Book-genre records as parsed from JSON.
15#[derive(Deserialize)]
16pub struct RawBookGenre {
17    pub book_id: String,
18    #[serde(default)]
19    pub genres: HashMap<String, i32>,
20}
21
22/// Rows in the processed book-genre Parquet table.
23#[derive(ParquetRecordWriter)]
24pub struct BookGenreRecord {
25    pub book_id: i32,
26    pub genre_id: i32,
27    pub count: i32,
28}
29
30/// Object writer to transform and write GoodReads book-genre records
31pub struct BookGenreWriter {
32    genres: IdIndex<String>,
33    writer: TableWriter<BookGenreRecord>,
34    n_recs: usize,
35}
36
37impl BookGenreWriter {
38    /// Open a new output
39    pub fn open() -> Result<BookGenreWriter> {
40        let writer = TableWriter::open(OUT_FILE)?;
41        Ok(BookGenreWriter {
42            genres: IdIndex::new(),
43            writer,
44            n_recs: 0,
45        })
46    }
47}
48
49impl ObjectWriter<RawBookGenre> for BookGenreWriter {
50    fn write_object(&mut self, row: RawBookGenre) -> Result<()> {
51        let book_id: i32 = row.book_id.parse()?;
52
53        for (genre, count) in row.genres {
54            let genre_id = self.genres.intern(&genre)?;
55            self.writer.write_object(BookGenreRecord {
56                book_id,
57                genre_id,
58                count,
59            })?;
60
61            self.n_recs += 1;
62        }
63
64        Ok(())
65    }
66
67    fn finish_objects(self) -> Result<usize> {
68        self.writer.finish_objects()?;
69        self.genres.save(GENRE_FILE, "genre_id", "genre")?;
70        Ok(self.n_recs)
71    }
72}