bookdata/goodreads/ids.rs

1//! GoodReads book identifier and linking support.
2use std::{collections::HashMap, fs::File};
3
4use anyhow::{anyhow, Result};
5use arrow::array::{Array, Int32Array};
6use log::*;
7use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask};
8use serde::{Deserialize, Serialize};
9
10use crate::{
11    ids::codes::{NS_GR_BOOK, NS_GR_WORK},
12    prelude::BDPath,
13};
14
/// Map from GoodReads book IDs to their link records.
pub type BookLinkMap = HashMap<i32, BookLinkRecord>;

/// Location of the GoodReads book-link Parquet file (resolved via [BDPath]).
const GR_LINK_FILE: BDPath<'static> = BDPath::new("goodreads/gr-book-link.parquet");
18
/// Book-link record.
///
/// Links a GoodReads book to its work (when one is known) and to its
/// integration cluster.
#[derive(Debug, Serialize, Deserialize)]
pub struct BookLinkRecord {
    /// The GoodReads book identifier.
    pub book_id: i32,
    /// The GoodReads work identifier; `None` when the book has no linked work.
    pub work_id: Option<i32>,
    /// The cluster identifier this book belongs to.
    pub cluster: i32,
}
26
27impl BookLinkRecord {
28    /// Get the GoodReads item ID for the book (work id, with fallback to book, in numberspace).
29    pub fn item_id(&self) -> i32 {
30        if let Some(w) = &self.work_id {
31            NS_GR_WORK.base() + w
32        } else {
33            NS_GR_BOOK.base() + self.book_id
34        }
35    }
36}
37
38/// Read a map of book IDs to linking identifiers.
39pub fn load_id_links() -> Result<BookLinkMap> {
40    let path = GR_LINK_FILE.resolve()?;
41    let file = File::open(path)?;
42
43    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
44    let meta = builder.metadata();
45    let n_rows = meta.file_metadata().num_rows();
46    let project = ProjectionMask::columns(
47        meta.file_metadata().schema_descr(),
48        ["book_id", "work_id", "cluster"],
49    );
50    let reader = builder.with_projection(project).build()?;
51
52    let mut map = HashMap::with_capacity(n_rows as usize);
53
54    for batch in reader {
55        let batch = batch?;
56        assert_eq!(batch.schema().field(0).name(), "book_id");
57        assert_eq!(batch.schema().field(1).name(), "work_id");
58        assert_eq!(batch.schema().field(2).name(), "cluster");
59        let book_id_col = batch
60            .column(0)
61            .as_any()
62            .downcast_ref::<Int32Array>()
63            .ok_or_else(|| anyhow!("invalid book_id column"))?;
64        let work_id_col = batch
65            .column(1)
66            .as_any()
67            .downcast_ref::<Int32Array>()
68            .ok_or_else(|| anyhow!("invalid work_id column"))?;
69        let cluster_col = batch
70            .column(2)
71            .as_any()
72            .downcast_ref::<Int32Array>()
73            .ok_or_else(|| anyhow!("invalid cluster column"))?;
74
75        for i in 0..batch.num_rows() {
76            let rec: BookLinkRecord = BookLinkRecord {
77                book_id: book_id_col.value(i),
78                work_id: if work_id_col.is_valid(i) {
79                    Some(work_id_col.value(i))
80                } else {
81                    None
82                },
83                cluster: cluster_col.value(i),
84            };
85            map.insert(rec.book_id, rec);
86        }
87    }
88
89    info!("read {} book links from {}", map.len(), GR_LINK_FILE);
90    Ok(map)
91}