bookdata/graph/
model.rs

1use std::fs::File;
2use std::io::Write;
3
4use anyhow::{anyhow, Result};
5use log::*;
6use parquet_derive::{ParquetRecordReader, ParquetRecordWriter};
7use serde::Serialize;
8
9use super::{BookID, IdGraph, IdNode};
10use crate::arrow::TableWriter;
11use crate::ids::codes::{ns_of_book_code, NS_ISBN};
12use crate::io::object::ObjectWriter;
13use crate::util::logging::item_progress;
14
/// Parquet file receiving one [`ISBNClusterRec`] row per ISBN node.
const ISBN_CLUSTER_PATH: &str = "book-links/isbn-clusters.parquet";
/// Parquet file receiving one [`ClusterCode`] row per graph node.
const GRAPH_NODE_PATH: &str = "book-links/cluster-graph-nodes.parquet";
/// Parquet file receiving one [`GraphEdge`] row per graph edge.
const GRAPH_EDGE_PATH: &str = "book-links/cluster-graph-edges.parquet";
/// Parquet file receiving one [`ClusterStat`] row per cluster.
const CLUSTER_STATS_PATH: &str = "book-links/cluster-stats.parquet";
/// JSON file receiving the overall clustering metrics.
const CLUSTER_METRICS_PATH: &str = "book-links/cluster-metrics.json";
20
21#[derive(ParquetRecordWriter, ParquetRecordReader, Debug)]
22pub struct ISBNClusterRec {
23    pub isbn: String,
24    pub isbn_id: i32,
25    pub cluster: i32,
26}
27
28#[derive(ParquetRecordWriter, Debug)]
29pub struct ClusterCode {
30    pub book_code: i32,
31    pub cluster: i32,
32    pub node_type: String,
33    pub label: Option<String>,
34}
35
36#[derive(ParquetRecordWriter, Debug)]
37pub struct GraphEdge {
38    pub src: i32,
39    pub dst: i32,
40}
41
42#[derive(ParquetRecordWriter, Debug, Default)]
43pub struct ClusterStat {
44    pub cluster: i32,
45    pub n_nodes: u32,
46    pub n_isbns: u32,
47    pub n_loc_recs: u32,
48    pub n_ol_editions: u32,
49    pub n_ol_works: u32,
50    pub n_gr_books: u32,
51    pub n_gr_works: u32,
52}
53
54#[derive(Serialize, Debug)]
55struct ClusteringStatistics {
56    clusters: usize,
57    largest: usize,
58    max_isbns: usize,
59}
60
61impl ClusterStat {
62    /// Create a cluster statistics object from a list of books codes.
63    pub fn create(cluster: i32, nodes: &Vec<&BookID>) -> ClusterStat {
64        let mut cs = ClusterStat::default();
65        cs.cluster = cluster;
66        cs.n_nodes = nodes.len() as u32;
67        for node in nodes {
68            if let Some(ns) = ns_of_book_code(node.code) {
69                match ns.name {
70                    "ISBN" => cs.n_isbns += 1,
71                    "LOC" => cs.n_loc_recs += 1,
72                    "OL-W" => cs.n_ol_works += 1,
73                    "OL-E" => cs.n_ol_editions += 1,
74                    "GR-W" => cs.n_gr_works += 1,
75                    "GR-B" => cs.n_gr_books += 1,
76                    _ => (),
77                }
78            }
79        }
80
81        cs
82    }
83}
84
85pub fn save_graph_cluster_data(graph: &IdGraph, clusters: Vec<Vec<IdNode>>) -> Result<()> {
86    let mut ic_w = TableWriter::open(ISBN_CLUSTER_PATH)?;
87
88    let mut n_w = TableWriter::open(GRAPH_NODE_PATH)?;
89    let mut cs_w = TableWriter::open(CLUSTER_STATS_PATH)?;
90
91    let mut m_size = 0;
92    let mut m_id = 0;
93    let mut m_isbns = 0;
94
95    info!("writing graph nodes");
96    let pb = item_progress(clusters.len(), "clusters");
97    for ci in 0..clusters.len() {
98        let verts = &clusters[ci];
99        let vids: Vec<_> = verts
100            .iter()
101            .map(|v| graph.node_weight(*v).unwrap())
102            .collect();
103        let cluster = vids.iter().map(|b| b.code).min().unwrap();
104        if vids.len() > m_size {
105            m_size = vids.len();
106            m_id = cluster;
107        }
108        cs_w.write_object(ClusterStat::create(cluster, &vids))?;
109        let mut n_isbns = 0;
110        for v in &vids {
111            n_w.write_object(ClusterCode {
112                cluster,
113                book_code: v.code,
114                node_type: ns_of_book_code(v.code).unwrap().name.to_string(),
115                label: v.label.clone(),
116            })?;
117            if let Some(id) = NS_ISBN.from_code(v.code) {
118                ic_w.write_object(ISBNClusterRec {
119                    cluster,
120                    isbn_id: id,
121                    isbn: v
122                        .label
123                        .clone()
124                        .ok_or_else(|| anyhow!("graph node missing ISBN label"))?,
125                })?;
126                n_isbns += 1;
127            }
128        }
129        if n_isbns > m_isbns {
130            m_isbns = n_isbns;
131        }
132        pb.inc(1);
133    }
134
135    ic_w.finish()?;
136    n_w.finish()?;
137    cs_w.finish()?;
138    pb.finish_and_clear();
139
140    info!("largest cluster {} has {} nodes", m_id, m_size);
141
142    info!("writing graph edges");
143    let mut e_w = TableWriter::open(GRAPH_EDGE_PATH)?;
144    for e in graph.edge_indices() {
145        let (s, d) = graph.edge_endpoints(e).unwrap();
146        let src = graph.node_weight(s).unwrap().code;
147        let dst = graph.node_weight(d).unwrap().code;
148        e_w.write_object(GraphEdge { src, dst })?;
149    }
150    e_w.finish()?;
151
152    info!("saving statistics");
153    let stats = ClusteringStatistics {
154        clusters: clusters.len(),
155        largest: m_size,
156        max_isbns: m_isbns,
157    };
158    let mut statf = File::create(CLUSTER_METRICS_PATH)?;
159    serde_json::to_writer(&mut statf, &stats)?;
160    statf.write(b"\n")?;
161
162    Ok(())
163}