bookdata/goodreads/
genres.rs1use std::collections::HashMap;
3
4use parquet_derive::ParquetRecordWriter;
5use serde::Deserialize;
6
7use crate::arrow::*;
8use crate::ids::index::IdIndex;
9use crate::prelude::*;
10
/// File name of the Parquet table that receives the book-genre records.
const OUT_FILE: &str = "gr-book-genres.parquet";
/// File name under which the genre index is saved when writing finishes.
const GENRE_FILE: &str = "gr-genres.parquet";
13
14#[derive(Deserialize)]
16pub struct RawBookGenre {
17 pub book_id: String,
18 #[serde(default)]
19 pub genres: HashMap<String, i32>,
20}
21
22#[derive(ParquetRecordWriter)]
24pub struct BookGenreRecord {
25 pub book_id: i32,
26 pub genre_id: i32,
27 pub count: i32,
28}
29
30pub struct BookGenreWriter {
32 genres: IdIndex<String>,
33 writer: TableWriter<BookGenreRecord>,
34 n_recs: usize,
35}
36
37impl BookGenreWriter {
38 pub fn open() -> Result<BookGenreWriter> {
40 let writer = TableWriter::open(OUT_FILE)?;
41 Ok(BookGenreWriter {
42 genres: IdIndex::new(),
43 writer,
44 n_recs: 0,
45 })
46 }
47}
48
49impl ObjectWriter<RawBookGenre> for BookGenreWriter {
50 fn write_object(&mut self, row: RawBookGenre) -> Result<()> {
51 let book_id: i32 = row.book_id.parse()?;
52
53 for (genre, count) in row.genres {
54 let genre_id = self.genres.intern(&genre)?;
55 self.writer.write_object(BookGenreRecord {
56 book_id,
57 genre_id,
58 count,
59 })?;
60
61 self.n_recs += 1;
62 }
63
64 Ok(())
65 }
66
67 fn finish_objects(self) -> Result<usize> {
68 self.writer.finish_objects()?;
69 self.genres.save(GENRE_FILE, "genre_id", "genre")?;
70 Ok(self.n_recs)
71 }
72}