bookdata/goodreads/
ids.rs1use std::{collections::HashMap, fs::File};
3
4use anyhow::{anyhow, Result};
5use arrow::array::{Array, Int32Array};
6use log::*;
7use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask};
8use serde::{Deserialize, Serialize};
9
10use crate::{
11 ids::codes::{NS_GR_BOOK, NS_GR_WORK},
12 prelude::BDPath,
13};
14
/// Map from a book ID to the link record for that book, as built by `load_id_links`.
pub type BookLinkMap = HashMap<i32, BookLinkRecord>;
16
/// Path of the book link Parquet file; `load_id_links` resolves it and reads it.
const GR_LINK_FILE: BDPath<'static> = BDPath::new("goodreads/gr-book-link.parquet");
18
19#[derive(Debug, Serialize, Deserialize)]
21pub struct BookLinkRecord {
22 pub book_id: i32,
23 pub work_id: Option<i32>,
24 pub cluster: i32,
25}
26
27impl BookLinkRecord {
28 pub fn item_id(&self) -> i32 {
30 if let Some(w) = &self.work_id {
31 NS_GR_WORK.base() + w
32 } else {
33 NS_GR_BOOK.base() + self.book_id
34 }
35 }
36}
37
38pub fn load_id_links() -> Result<BookLinkMap> {
40 let path = GR_LINK_FILE.resolve()?;
41 let file = File::open(path)?;
42
43 let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
44 let meta = builder.metadata();
45 let n_rows = meta.file_metadata().num_rows();
46 let project = ProjectionMask::columns(
47 meta.file_metadata().schema_descr(),
48 ["book_id", "work_id", "cluster"],
49 );
50 let reader = builder.with_projection(project).build()?;
51
52 let mut map = HashMap::with_capacity(n_rows as usize);
53
54 for batch in reader {
55 let batch = batch?;
56 assert_eq!(batch.schema().field(0).name(), "book_id");
57 assert_eq!(batch.schema().field(1).name(), "work_id");
58 assert_eq!(batch.schema().field(2).name(), "cluster");
59 let book_id_col = batch
60 .column(0)
61 .as_any()
62 .downcast_ref::<Int32Array>()
63 .ok_or_else(|| anyhow!("invalid book_id column"))?;
64 let work_id_col = batch
65 .column(1)
66 .as_any()
67 .downcast_ref::<Int32Array>()
68 .ok_or_else(|| anyhow!("invalid work_id column"))?;
69 let cluster_col = batch
70 .column(2)
71 .as_any()
72 .downcast_ref::<Int32Array>()
73 .ok_or_else(|| anyhow!("invalid cluster column"))?;
74
75 for i in 0..batch.num_rows() {
76 let rec: BookLinkRecord = BookLinkRecord {
77 book_id: book_id_col.value(i),
78 work_id: if work_id_col.is_valid(i) {
79 Some(work_id_col.value(i))
80 } else {
81 None
82 },
83 cluster: cluster_col.value(i),
84 };
85 map.insert(rec.book_id, rec);
86 }
87 }
88
89 info!("read {} book links from {}", map.len(), GR_LINK_FILE);
90 Ok(map)
91}