bookdata/graph/
model.rs

1use std::fs::File;
2
3use anyhow::{anyhow, Result};
4use log::*;
5use parquet_derive::{ParquetRecordReader, ParquetRecordWriter};
6use serde::Serialize;
7
8use super::{BookID, IdGraph, IdNode};
9use crate::arrow::TableWriter;
10use crate::ids::codes::{ns_of_book_code, NS_ISBN};
11use crate::io::object::ObjectWriter;
12use crate::util::logging::item_progress;
13
/// Output file for the graph node table (`ClusterCode` rows).
const GRAPH_NODE_PATH: &str = "book-links/cluster-graph-nodes.parquet";
/// Output file for the graph edge table (`GraphEdge` rows).
const GRAPH_EDGE_PATH: &str = "book-links/cluster-graph-edges.parquet";
/// Output file for the ISBN-to-cluster table (`ISBNClusterRec` rows).
const ISBN_CLUSTER_PATH: &str = "book-links/isbn-clusters.parquet";
/// Output file for the per-cluster statistics table (`ClusterStat` rows).
const CLUSTER_STATS_PATH: &str = "book-links/cluster-stats.parquet";
/// Output file for the JSON summary of the whole clustering.
const CLUSTER_METRICS_PATH: &str = "book-links/cluster-metrics.json";
19
20#[derive(ParquetRecordWriter, ParquetRecordReader, Debug)]
21pub struct ISBNClusterRec {
22    pub isbn: String,
23    pub isbn_id: i32,
24    pub cluster: i32,
25}
26
27#[derive(ParquetRecordWriter, Debug)]
28pub struct ClusterCode {
29    pub book_code: i32,
30    pub cluster: i32,
31    pub node_type: String,
32    pub label: Option<String>,
33}
34
35#[derive(ParquetRecordWriter, Debug)]
36pub struct GraphEdge {
37    pub src: i32,
38    pub dst: i32,
39}
40
41#[derive(ParquetRecordWriter, Debug, Default)]
42pub struct ClusterStat {
43    pub cluster: i32,
44    pub n_nodes: u32,
45    pub n_isbns: u32,
46    pub n_loc_recs: u32,
47    pub n_ol_editions: u32,
48    pub n_ol_works: u32,
49    pub n_gr_books: u32,
50    pub n_gr_works: u32,
51}
52
53#[derive(Serialize, Debug)]
54struct ClusteringStatistics {
55    clusters: usize,
56    largest: usize,
57    max_isbns: usize,
58}
59
60impl ClusterStat {
61    /// Create a cluster statistics object from a list of books codes.
62    pub fn create(cluster: i32, nodes: &Vec<&BookID>) -> ClusterStat {
63        let mut cs = ClusterStat::default();
64        cs.cluster = cluster;
65        cs.n_nodes = nodes.len() as u32;
66        for node in nodes {
67            if let Some(ns) = ns_of_book_code(node.code) {
68                match ns.name {
69                    "ISBN" => cs.n_isbns += 1,
70                    "LOC" => cs.n_loc_recs += 1,
71                    "OL-W" => cs.n_ol_works += 1,
72                    "OL-E" => cs.n_ol_editions += 1,
73                    "GR-W" => cs.n_gr_works += 1,
74                    "GR-B" => cs.n_gr_books += 1,
75                    _ => (),
76                }
77            }
78        }
79
80        cs
81    }
82}
83
84pub fn save_graph_cluster_data(graph: &IdGraph, clusters: Vec<Vec<IdNode>>) -> Result<()> {
85    let mut ic_w = TableWriter::open(ISBN_CLUSTER_PATH)?;
86
87    let mut n_w = TableWriter::open(GRAPH_NODE_PATH)?;
88    let mut cs_w = TableWriter::open(CLUSTER_STATS_PATH)?;
89
90    let mut m_size = 0;
91    let mut m_id = 0;
92    let mut m_isbns = 0;
93
94    info!("writing graph nodes");
95    let pb = item_progress(clusters.len(), "clusters");
96    for ci in 0..clusters.len() {
97        let verts = &clusters[ci];
98        let vids: Vec<_> = verts
99            .iter()
100            .map(|v| graph.node_weight(*v).unwrap())
101            .collect();
102        let cluster = vids.iter().map(|b| b.code).min().unwrap();
103        if vids.len() > m_size {
104            m_size = vids.len();
105            m_id = cluster;
106        }
107        cs_w.write_object(ClusterStat::create(cluster, &vids))?;
108        let mut n_isbns = 0;
109        for v in &vids {
110            n_w.write_object(ClusterCode {
111                cluster,
112                book_code: v.code,
113                node_type: ns_of_book_code(v.code).unwrap().name.to_string(),
114                label: v.label.clone(),
115            })?;
116            if let Some(id) = NS_ISBN.from_code(v.code) {
117                ic_w.write_object(ISBNClusterRec {
118                    cluster,
119                    isbn_id: id,
120                    isbn: v
121                        .label
122                        .clone()
123                        .ok_or_else(|| anyhow!("graph node missing ISBN label"))?,
124                })?;
125                n_isbns += 1;
126            }
127        }
128        if n_isbns > m_isbns {
129            m_isbns = n_isbns;
130        }
131        pb.inc(1);
132    }
133
134    ic_w.finish()?;
135    n_w.finish()?;
136    cs_w.finish()?;
137    pb.finish_and_clear();
138
139    info!("largest cluster {} has {} nodes", m_id, m_size);
140
141    info!("writing graph edges");
142    let mut e_w = TableWriter::open(GRAPH_EDGE_PATH)?;
143    for e in graph.edge_indices() {
144        let (s, d) = graph.edge_endpoints(e).unwrap();
145        let src = graph.node_weight(s).unwrap().code;
146        let dst = graph.node_weight(d).unwrap().code;
147        e_w.write_object(GraphEdge { src, dst })?;
148    }
149    e_w.finish()?;
150
151    info!("saving statistics");
152    let stats = ClusteringStatistics {
153        clusters: clusters.len(),
154        largest: m_size,
155        max_isbns: m_isbns,
156    };
157    let statf = File::create(CLUSTER_METRICS_PATH)?;
158    serde_json::to_writer(statf, &stats)?;
159
160    Ok(())
161}