bookdata/cli/cluster/
hash.rs1use std::collections::HashMap;
3use std::path::PathBuf;
4
5use hex;
6use md5::{Digest, Md5};
7use parquet_derive::ParquetRecordWriter;
8
9use crate::arrow::*;
10use crate::prelude::*;
11use polars::prelude::*;
12
13use crate::prelude::Result;
14
15#[derive(Args, Debug)]
16#[command(name = "hash")]
17pub struct HashCmd {
19 #[arg(short = 'o', long = "output", name = "FILE")]
21 output: PathBuf,
22
23 #[arg(name = "ISBN_CLUSTERS")]
25 cluster_file: PathBuf,
26}
27
28#[derive(ParquetRecordWriter)]
29struct ClusterHash {
30 cluster: i32,
31 isbn_hash: String,
32 isbn_dcode: i8,
33}
34
35fn scan_isbns(path: &Path) -> Result<LazyFrame> {
37 let path = path
38 .to_str()
39 .map(|s| s.to_string())
40 .ok_or(anyhow!("invalid UTF8 pathname"))?;
41 info!("scanning ISBN cluster file {}", path);
42 let icl = LazyFrame::scan_parquet(path, default())?;
43 let icl = icl.select(&[col("isbn"), col("cluster")]);
44 Ok(icl)
45}
46
47impl Command for HashCmd {
48 fn exec(&self) -> Result<()> {
49 let isbns = scan_isbns(self.cluster_file.as_path())?;
50
51 info!("reading sorted ISBNs into memory");
55 let isbns = isbns.sort("isbn", SortOptions::default()).collect()?;
56
57 info!("computing ISBN hashes");
58 let mut hashes: HashMap<i32, Md5> = HashMap::new();
59 let isbn_col = isbns.column("isbn")?.str()?;
60 let clus_col = isbns.column("cluster")?.i32()?;
61 for pair in isbn_col.into_iter().zip(clus_col.into_iter()) {
62 if let (Some(i), Some(c)) = pair {
63 hashes.entry(c).or_default().update(i.as_bytes());
64 }
65 }
66
67 info!("computed hashes for {} clusters", hashes.len());
68
69 let path = self.output.as_path();
70 info!("writing ISBN hashes to {:?}", path);
71 let mut writer = TableWriter::open(path)?;
72 for (cluster, h) in hashes.into_iter() {
73 let h = h.finalize();
74 writer.write_object(ClusterHash {
75 cluster,
76 isbn_hash: hex::encode(h),
77 isbn_dcode: (h[h.len() - 1] % 2) as i8,
78 })?;
79 }
80
81 writer.finish()?;
82
83 Ok(())
84 }
85}