bookdata/graph/
sources.rs

1use std::fmt::Debug;
2
3use anyhow::Result;
4
5use polars::prelude::*;
6
7use crate::ids::codes::*;
8use crate::util::default;
9
10pub trait EdgeRead: Debug {
11    fn read_edges(&self) -> Result<LazyFrame>;
12}
13
14pub trait NodeRead: Debug {
15    fn read_node_ids(&self) -> Result<LazyFrame>;
16}
17
/// Node source for ISBNs, read from `book-links/all-isbns.parquet`.
#[derive(Debug)]
pub struct ISBN;
/// Node and edge source for LOC records, read from the `loc-mds/` files.
#[derive(Debug)]
pub struct LOC;
/// Node and edge source for editions, read from the `openlibrary/` files.
#[derive(Debug)]
pub struct OLEditions;
/// Node and edge source for works, read from the `openlibrary/` files.
#[derive(Debug)]
pub struct OLWorks;
/// Node and edge source for books, read from the `goodreads/` files.
#[derive(Debug)]
pub struct GRBooks;
/// Node and edge source for works, read from `goodreads/gr-book-ids.parquet`.
#[derive(Debug)]
pub struct GRWorks;
30
31/// Get an ID column and apply the appropriate namespace adjustment.
32fn id_col(name: &str, ns: NS<'_>) -> Expr {
33    col(name) + lit(ns.base())
34}
35
36impl NodeRead for ISBN {
37    fn read_node_ids(&self) -> Result<LazyFrame> {
38        let df = LazyFrame::scan_parquet("book-links/all-isbns.parquet", default())?;
39        let df = df.select([
40            id_col("isbn_id", NS_ISBN).alias("code"),
41            col("isbn").alias("label"),
42        ]);
43        Ok(df)
44    }
45}
46
47impl NodeRead for LOC {
48    fn read_node_ids(&self) -> Result<LazyFrame> {
49        let df = LazyFrame::scan_parquet("loc-mds/book-ids.parquet", default())?;
50        let df = df.select([id_col("rec_id", NS_LOC_REC).alias("code")]);
51        Ok(df)
52    }
53}
54
55impl EdgeRead for LOC {
56    fn read_edges(&self) -> Result<LazyFrame> {
57        let df = LazyFrame::scan_parquet("loc-mds/book-isbn-ids.parquet", default())?;
58        let df = df.select([
59            id_col("isbn_id", NS_ISBN).alias("src"),
60            id_col("rec_id", NS_LOC_REC).alias("dst"),
61        ]);
62        Ok(df)
63    }
64}
65
66impl NodeRead for OLEditions {
67    fn read_node_ids(&self) -> Result<LazyFrame> {
68        let df = LazyFrame::scan_parquet("openlibrary/editions.parquet", default())?;
69        let df = df.select([id_col("id", NS_EDITION).alias("code")]);
70        Ok(df)
71    }
72}
73
74impl EdgeRead for OLEditions {
75    fn read_edges(&self) -> Result<LazyFrame> {
76        let df = LazyFrame::scan_parquet("openlibrary/edition-isbn-ids.parquet", default())?;
77        let df = df.select([
78            id_col("isbn_id", NS_ISBN).alias("src"),
79            id_col("edition", NS_EDITION).alias("dst"),
80        ]);
81        Ok(df)
82    }
83}
84
85impl NodeRead for OLWorks {
86    fn read_node_ids(&self) -> Result<LazyFrame> {
87        let wdf = LazyFrame::scan_parquet("openlibrary/works.parquet", default())?.select([
88            id_col("id", NS_WORK).alias("code"),
89            col("key").alias("label"),
90        ]);
91        let ewdf = LazyFrame::scan_parquet("openlibrary/edition-works.parquet", default())?
92            .select([id_col("work", NS_WORK).alias("code")])
93            .unique(None, UniqueKeepStrategy::Any);
94        let df = wdf.join(
95            ewdf,
96            [col("code")],
97            [col("code")],
98            JoinArgs::new(JoinType::Outer { coalesce: true }),
99        );
100        Ok(df)
101    }
102}
103
104impl EdgeRead for OLWorks {
105    fn read_edges(&self) -> Result<LazyFrame> {
106        let df = LazyFrame::scan_parquet("openlibrary/edition-works.parquet", default())?;
107        let df = df.select([
108            id_col("edition", NS_EDITION).alias("src"),
109            id_col("work", NS_WORK).alias("dst"),
110        ]);
111        Ok(df)
112    }
113}
114
115impl NodeRead for GRBooks {
116    fn read_node_ids(&self) -> Result<LazyFrame> {
117        let df = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
118        let df = df.select([id_col("book_id", NS_GR_BOOK).alias("code")]);
119        Ok(df)
120    }
121}
122
123impl EdgeRead for GRBooks {
124    fn read_edges(&self) -> Result<LazyFrame> {
125        let df = LazyFrame::scan_parquet("goodreads/book-isbn-ids.parquet", default())?;
126        let df = df.select([
127            id_col("isbn_id", NS_ISBN).alias("src"),
128            id_col("book_id", NS_GR_BOOK).alias("dst"),
129        ]);
130        Ok(df)
131    }
132}
133
134impl NodeRead for GRWorks {
135    fn read_node_ids(&self) -> Result<LazyFrame> {
136        let df = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
137        let df = df.filter(col("work_id").is_not_null());
138        let df = df.select([id_col("work_id", NS_GR_WORK).alias("code")]);
139        Ok(df)
140    }
141}
142
143impl EdgeRead for GRWorks {
144    fn read_edges(&self) -> Result<LazyFrame> {
145        let df = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
146        let df = df.filter(col("work_id").is_not_null());
147        let df = df.select([
148            id_col("book_id", NS_GR_BOOK).alias("src"),
149            id_col("work_id", NS_GR_WORK).alias("dst"),
150        ]);
151        Ok(df)
152    }
153}