// bookdata/graph/sources.rs
use std::fmt::Debug;
2
3use anyhow::Result;
4
5use polars::prelude::*;
6
7use crate::ids::codes::*;
8use crate::util::default;
9
10pub trait EdgeRead: Debug {
11 fn read_edges(&self) -> Result<LazyFrame>;
12}
13
14pub trait NodeRead: Debug {
15 fn read_node_ids(&self) -> Result<LazyFrame>;
16}
17
/// Marker for the ISBN node source (`book-links/all-isbns.parquet`).
#[derive(Debug)]
pub struct ISBN;

/// Marker for the `loc-mds` record source (nodes and ISBN edges).
#[derive(Debug)]
pub struct LOC;

/// Marker for the `openlibrary` edition source (nodes and ISBN edges).
#[derive(Debug)]
pub struct OLEditions;

/// Marker for the `openlibrary` work source (nodes and edition edges).
#[derive(Debug)]
pub struct OLWorks;

/// Marker for the `goodreads` book source (nodes and ISBN edges).
#[derive(Debug)]
pub struct GRBooks;

/// Marker for the `goodreads` work source (nodes and book edges).
#[derive(Debug)]
pub struct GRWorks;
30
31fn id_col(name: &str, ns: NS<'_>) -> Expr {
33 col(name) + lit(ns.base())
34}
35
36impl NodeRead for ISBN {
37 fn read_node_ids(&self) -> Result<LazyFrame> {
38 let df = LazyFrame::scan_parquet("book-links/all-isbns.parquet", default())?;
39 let df = df.select([
40 id_col("isbn_id", NS_ISBN).alias("code"),
41 col("isbn").alias("label"),
42 ]);
43 Ok(df)
44 }
45}
46
47impl NodeRead for LOC {
48 fn read_node_ids(&self) -> Result<LazyFrame> {
49 let df = LazyFrame::scan_parquet("loc-mds/book-ids.parquet", default())?;
50 let df = df.select([id_col("rec_id", NS_LOC_REC).alias("code")]);
51 Ok(df)
52 }
53}
54
55impl EdgeRead for LOC {
56 fn read_edges(&self) -> Result<LazyFrame> {
57 let df = LazyFrame::scan_parquet("loc-mds/book-isbn-ids.parquet", default())?;
58 let df = df.select([
59 id_col("isbn_id", NS_ISBN).alias("src"),
60 id_col("rec_id", NS_LOC_REC).alias("dst"),
61 ]);
62 Ok(df)
63 }
64}
65
66impl NodeRead for OLEditions {
67 fn read_node_ids(&self) -> Result<LazyFrame> {
68 let df = LazyFrame::scan_parquet("openlibrary/editions.parquet", default())?;
69 let df = df.select([id_col("id", NS_EDITION).alias("code")]);
70 Ok(df)
71 }
72}
73
74impl EdgeRead for OLEditions {
75 fn read_edges(&self) -> Result<LazyFrame> {
76 let df = LazyFrame::scan_parquet("openlibrary/edition-isbn-ids.parquet", default())?;
77 let df = df.select([
78 id_col("isbn_id", NS_ISBN).alias("src"),
79 id_col("edition", NS_EDITION).alias("dst"),
80 ]);
81 Ok(df)
82 }
83}
84
85impl NodeRead for OLWorks {
86 fn read_node_ids(&self) -> Result<LazyFrame> {
87 let wdf = LazyFrame::scan_parquet("openlibrary/works.parquet", default())?.select([
88 id_col("id", NS_WORK).alias("code"),
89 col("key").alias("label"),
90 ]);
91 let ewdf = LazyFrame::scan_parquet("openlibrary/edition-works.parquet", default())?
92 .select([id_col("work", NS_WORK).alias("code")])
93 .unique(None, UniqueKeepStrategy::Any);
94 let df = wdf.join(
95 ewdf,
96 [col("code")],
97 [col("code")],
98 JoinArgs::new(JoinType::Outer { coalesce: true }),
99 );
100 Ok(df)
101 }
102}
103
104impl EdgeRead for OLWorks {
105 fn read_edges(&self) -> Result<LazyFrame> {
106 let df = LazyFrame::scan_parquet("openlibrary/edition-works.parquet", default())?;
107 let df = df.select([
108 id_col("edition", NS_EDITION).alias("src"),
109 id_col("work", NS_WORK).alias("dst"),
110 ]);
111 Ok(df)
112 }
113}
114
115impl NodeRead for GRBooks {
116 fn read_node_ids(&self) -> Result<LazyFrame> {
117 let df = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
118 let df = df.select([id_col("book_id", NS_GR_BOOK).alias("code")]);
119 Ok(df)
120 }
121}
122
123impl EdgeRead for GRBooks {
124 fn read_edges(&self) -> Result<LazyFrame> {
125 let df = LazyFrame::scan_parquet("goodreads/book-isbn-ids.parquet", default())?;
126 let df = df.select([
127 id_col("isbn_id", NS_ISBN).alias("src"),
128 id_col("book_id", NS_GR_BOOK).alias("dst"),
129 ]);
130 Ok(df)
131 }
132}
133
134impl NodeRead for GRWorks {
135 fn read_node_ids(&self) -> Result<LazyFrame> {
136 let df = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
137 let df = df.filter(col("work_id").is_not_null());
138 let df = df.select([id_col("work_id", NS_GR_WORK).alias("code")]);
139 Ok(df)
140 }
141}
142
143impl EdgeRead for GRWorks {
144 fn read_edges(&self) -> Result<LazyFrame> {
145 let df = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
146 let df = df.filter(col("work_id").is_not_null());
147 let df = df.select([
148 id_col("book_id", NS_GR_BOOK).alias("src"),
149 id_col("work_id", NS_GR_WORK).alias("dst"),
150 ]);
151 Ok(df)
152 }
153}