1use std::fs::File;
2
3use anyhow::{anyhow, Result};
4use log::*;
5use parquet_derive::{ParquetRecordReader, ParquetRecordWriter};
6use serde::Serialize;
7
8use super::{BookID, IdGraph, IdNode};
9use crate::arrow::TableWriter;
10use crate::ids::codes::{ns_of_book_code, NS_ISBN};
11use crate::io::object::ObjectWriter;
12use crate::util::logging::item_progress;
13
// Output locations written by `save_graph_cluster_data`.

/// Parquet table of `ISBNClusterRec` rows (one per ISBN node written).
const ISBN_CLUSTER_PATH: &str = "book-links/isbn-clusters.parquet";
/// Parquet table of `ClusterCode` rows (one per graph node in a cluster).
const GRAPH_NODE_PATH: &str = "book-links/cluster-graph-nodes.parquet";
/// Parquet table of `GraphEdge` rows (one per edge in the graph).
const GRAPH_EDGE_PATH: &str = "book-links/cluster-graph-edges.parquet";
/// Parquet table of `ClusterStat` rows (one per cluster).
const CLUSTER_STATS_PATH: &str = "book-links/cluster-stats.parquet";
/// JSON file holding the serialized `ClusteringStatistics` summary.
const CLUSTER_METRICS_PATH: &str = "book-links/cluster-metrics.json";
19
/// Row type linking a single ISBN to the cluster that contains it.
///
/// Derives both the Parquet record writer and reader, so the same struct
/// can be used to write the table and to read it back.
#[derive(ParquetRecordWriter, ParquetRecordReader, Debug)]
pub struct ISBNClusterRec {
    /// The ISBN text; the writer in this file fills it from the graph node's label.
    pub isbn: String,
    /// Numeric ISBN identifier, obtained from the node's book code via `NS_ISBN.from_code`.
    pub isbn_id: i32,
    /// Cluster identifier; the writer in this file uses the smallest book code in the cluster.
    pub cluster: i32,
}
26
/// Row type describing one graph node and the cluster it was assigned to.
#[derive(ParquetRecordWriter, Debug)]
pub struct ClusterCode {
    /// The node's book code.
    pub book_code: i32,
    /// Cluster identifier; the writer in this file uses the smallest book code in the cluster.
    pub cluster: i32,
    /// Name of the namespace the book code belongs to, as reported by `ns_of_book_code`.
    pub node_type: String,
    /// The node's label, if it has one.
    pub label: Option<String>,
}
34
/// Row type for one edge of the identifier graph, expressed in book codes.
#[derive(ParquetRecordWriter, Debug)]
pub struct GraphEdge {
    /// Book code of the first endpoint reported for the edge.
    pub src: i32,
    /// Book code of the second endpoint reported for the edge.
    pub dst: i32,
}
40
/// Per-cluster node counts, broken down by the namespace of each node's book code.
///
/// Built by `ClusterStat::create`; all counters start at zero via `Default`.
// NOTE(review): the namespace abbreviations presumably stand for Library of
// Congress (LOC), OpenLibrary (OL) and GoodReads (GR) — confirm against the
// namespace definitions.
#[derive(ParquetRecordWriter, Debug, Default)]
pub struct ClusterStat {
    /// Cluster identifier.
    pub cluster: i32,
    /// Total number of nodes in the cluster, regardless of namespace.
    pub n_nodes: u32,
    /// Number of nodes whose namespace name is `"ISBN"`.
    pub n_isbns: u32,
    /// Number of nodes whose namespace name is `"LOC"`.
    pub n_loc_recs: u32,
    /// Number of nodes whose namespace name is `"OL-E"`.
    pub n_ol_editions: u32,
    /// Number of nodes whose namespace name is `"OL-W"`.
    pub n_ol_works: u32,
    /// Number of nodes whose namespace name is `"GR-B"`.
    pub n_gr_books: u32,
    /// Number of nodes whose namespace name is `"GR-W"`.
    pub n_gr_works: u32,
}
52
/// Summary of a whole clustering run, serialized to the JSON metrics file.
///
/// Field names are the serialized keys, so renaming a field changes the output.
#[derive(Serialize, Debug)]
struct ClusteringStatistics {
    /// Number of clusters that were saved.
    clusters: usize,
    /// Node count of the largest cluster.
    largest: usize,
    /// Largest number of ISBN nodes found in any single cluster.
    max_isbns: usize,
}
59
60impl ClusterStat {
61 pub fn create(cluster: i32, nodes: &Vec<&BookID>) -> ClusterStat {
63 let mut cs = ClusterStat::default();
64 cs.cluster = cluster;
65 cs.n_nodes = nodes.len() as u32;
66 for node in nodes {
67 if let Some(ns) = ns_of_book_code(node.code) {
68 match ns.name {
69 "ISBN" => cs.n_isbns += 1,
70 "LOC" => cs.n_loc_recs += 1,
71 "OL-W" => cs.n_ol_works += 1,
72 "OL-E" => cs.n_ol_editions += 1,
73 "GR-W" => cs.n_gr_works += 1,
74 "GR-B" => cs.n_gr_books += 1,
75 _ => (),
76 }
77 }
78 }
79
80 cs
81 }
82}
83
84pub fn save_graph_cluster_data(graph: &IdGraph, clusters: Vec<Vec<IdNode>>) -> Result<()> {
85 let mut ic_w = TableWriter::open(ISBN_CLUSTER_PATH)?;
86
87 let mut n_w = TableWriter::open(GRAPH_NODE_PATH)?;
88 let mut cs_w = TableWriter::open(CLUSTER_STATS_PATH)?;
89
90 let mut m_size = 0;
91 let mut m_id = 0;
92 let mut m_isbns = 0;
93
94 info!("writing graph nodes");
95 let pb = item_progress(clusters.len(), "clusters");
96 for ci in 0..clusters.len() {
97 let verts = &clusters[ci];
98 let vids: Vec<_> = verts
99 .iter()
100 .map(|v| graph.node_weight(*v).unwrap())
101 .collect();
102 let cluster = vids.iter().map(|b| b.code).min().unwrap();
103 if vids.len() > m_size {
104 m_size = vids.len();
105 m_id = cluster;
106 }
107 cs_w.write_object(ClusterStat::create(cluster, &vids))?;
108 let mut n_isbns = 0;
109 for v in &vids {
110 n_w.write_object(ClusterCode {
111 cluster,
112 book_code: v.code,
113 node_type: ns_of_book_code(v.code).unwrap().name.to_string(),
114 label: v.label.clone(),
115 })?;
116 if let Some(id) = NS_ISBN.from_code(v.code) {
117 ic_w.write_object(ISBNClusterRec {
118 cluster,
119 isbn_id: id,
120 isbn: v
121 .label
122 .clone()
123 .ok_or_else(|| anyhow!("graph node missing ISBN label"))?,
124 })?;
125 n_isbns += 1;
126 }
127 }
128 if n_isbns > m_isbns {
129 m_isbns = n_isbns;
130 }
131 pb.inc(1);
132 }
133
134 ic_w.finish()?;
135 n_w.finish()?;
136 cs_w.finish()?;
137 pb.finish_and_clear();
138
139 info!("largest cluster {} has {} nodes", m_id, m_size);
140
141 info!("writing graph edges");
142 let mut e_w = TableWriter::open(GRAPH_EDGE_PATH)?;
143 for e in graph.edge_indices() {
144 let (s, d) = graph.edge_endpoints(e).unwrap();
145 let src = graph.node_weight(s).unwrap().code;
146 let dst = graph.node_weight(d).unwrap().code;
147 e_w.write_object(GraphEdge { src, dst })?;
148 }
149 e_w.finish()?;
150
151 info!("saving statistics");
152 let stats = ClusteringStatistics {
153 clusters: clusters.len(),
154 largest: m_size,
155 max_isbns: m_isbns,
156 };
157 let statf = File::create(CLUSTER_METRICS_PATH)?;
158 serde_json::to_writer(statf, &stats)?;
159
160 Ok(())
161}