1use std::fs::File;
2use std::io::Write;
3
4use anyhow::{anyhow, Result};
5use log::*;
6use parquet_derive::{ParquetRecordReader, ParquetRecordWriter};
7use serde::Serialize;
8
9use super::{BookID, IdGraph, IdNode};
10use crate::arrow::TableWriter;
11use crate::ids::codes::{ns_of_book_code, NS_ISBN};
12use crate::io::object::ObjectWriter;
13use crate::util::logging::item_progress;
14
/// Relative path of the Parquet table of ISBN-to-cluster assignments.
const ISBN_CLUSTER_PATH: &str = "book-links/isbn-clusters.parquet";
/// Relative path of the Parquet table with one row per clustered graph node.
const GRAPH_NODE_PATH: &str = "book-links/cluster-graph-nodes.parquet";
/// Relative path of the Parquet table with one row per graph edge.
const GRAPH_EDGE_PATH: &str = "book-links/cluster-graph-edges.parquet";
/// Relative path of the Parquet table of per-cluster node counts.
const CLUSTER_STATS_PATH: &str = "book-links/cluster-stats.parquet";
/// Relative path of the JSON file holding summary metrics for the whole clustering.
const CLUSTER_METRICS_PATH: &str = "book-links/cluster-metrics.json";
20
21#[derive(ParquetRecordWriter, ParquetRecordReader, Debug)]
22pub struct ISBNClusterRec {
23 pub isbn: String,
24 pub isbn_id: i32,
25 pub cluster: i32,
26}
27
28#[derive(ParquetRecordWriter, Debug)]
29pub struct ClusterCode {
30 pub book_code: i32,
31 pub cluster: i32,
32 pub node_type: String,
33 pub label: Option<String>,
34}
35
36#[derive(ParquetRecordWriter, Debug)]
37pub struct GraphEdge {
38 pub src: i32,
39 pub dst: i32,
40}
41
42#[derive(ParquetRecordWriter, Debug, Default)]
43pub struct ClusterStat {
44 pub cluster: i32,
45 pub n_nodes: u32,
46 pub n_isbns: u32,
47 pub n_loc_recs: u32,
48 pub n_ol_editions: u32,
49 pub n_ol_works: u32,
50 pub n_gr_books: u32,
51 pub n_gr_works: u32,
52}
53
54#[derive(Serialize, Debug)]
55struct ClusteringStatistics {
56 clusters: usize,
57 largest: usize,
58 max_isbns: usize,
59}
60
61impl ClusterStat {
62 pub fn create(cluster: i32, nodes: &Vec<&BookID>) -> ClusterStat {
64 let mut cs = ClusterStat::default();
65 cs.cluster = cluster;
66 cs.n_nodes = nodes.len() as u32;
67 for node in nodes {
68 if let Some(ns) = ns_of_book_code(node.code) {
69 match ns.name {
70 "ISBN" => cs.n_isbns += 1,
71 "LOC" => cs.n_loc_recs += 1,
72 "OL-W" => cs.n_ol_works += 1,
73 "OL-E" => cs.n_ol_editions += 1,
74 "GR-W" => cs.n_gr_works += 1,
75 "GR-B" => cs.n_gr_books += 1,
76 _ => (),
77 }
78 }
79 }
80
81 cs
82 }
83}
84
85pub fn save_graph_cluster_data(graph: &IdGraph, clusters: Vec<Vec<IdNode>>) -> Result<()> {
86 let mut ic_w = TableWriter::open(ISBN_CLUSTER_PATH)?;
87
88 let mut n_w = TableWriter::open(GRAPH_NODE_PATH)?;
89 let mut cs_w = TableWriter::open(CLUSTER_STATS_PATH)?;
90
91 let mut m_size = 0;
92 let mut m_id = 0;
93 let mut m_isbns = 0;
94
95 info!("writing graph nodes");
96 let pb = item_progress(clusters.len(), "clusters");
97 for ci in 0..clusters.len() {
98 let verts = &clusters[ci];
99 let vids: Vec<_> = verts
100 .iter()
101 .map(|v| graph.node_weight(*v).unwrap())
102 .collect();
103 let cluster = vids.iter().map(|b| b.code).min().unwrap();
104 if vids.len() > m_size {
105 m_size = vids.len();
106 m_id = cluster;
107 }
108 cs_w.write_object(ClusterStat::create(cluster, &vids))?;
109 let mut n_isbns = 0;
110 for v in &vids {
111 n_w.write_object(ClusterCode {
112 cluster,
113 book_code: v.code,
114 node_type: ns_of_book_code(v.code).unwrap().name.to_string(),
115 label: v.label.clone(),
116 })?;
117 if let Some(id) = NS_ISBN.from_code(v.code) {
118 ic_w.write_object(ISBNClusterRec {
119 cluster,
120 isbn_id: id,
121 isbn: v
122 .label
123 .clone()
124 .ok_or_else(|| anyhow!("graph node missing ISBN label"))?,
125 })?;
126 n_isbns += 1;
127 }
128 }
129 if n_isbns > m_isbns {
130 m_isbns = n_isbns;
131 }
132 pb.inc(1);
133 }
134
135 ic_w.finish()?;
136 n_w.finish()?;
137 cs_w.finish()?;
138 pb.finish_and_clear();
139
140 info!("largest cluster {} has {} nodes", m_id, m_size);
141
142 info!("writing graph edges");
143 let mut e_w = TableWriter::open(GRAPH_EDGE_PATH)?;
144 for e in graph.edge_indices() {
145 let (s, d) = graph.edge_endpoints(e).unwrap();
146 let src = graph.node_weight(s).unwrap().code;
147 let dst = graph.node_weight(d).unwrap().code;
148 e_w.write_object(GraphEdge { src, dst })?;
149 }
150 e_w.finish()?;
151
152 info!("saving statistics");
153 let stats = ClusteringStatistics {
154 clusters: clusters.len(),
155 largest: m_size,
156 max_isbns: m_isbns,
157 };
158 let mut statf = File::create(CLUSTER_METRICS_PATH)?;
159 serde_json::to_writer(&mut statf, &stats)?;
160 statf.write(b"\n")?;
161
162 Ok(())
163}