bookdata/cli/cluster/
hash.rs

//! Compute a hash of the ISBNs in each book cluster.
2use std::collections::HashMap;
3use std::path::PathBuf;
4
5use hex;
6use md5::{Digest, Md5};
7use parquet_derive::ParquetRecordWriter;
8
9use crate::arrow::*;
10use crate::prelude::*;
11use polars::prelude::*;
12
13use crate::prelude::Result;
14
// Command-line arguments for the `hash` subcommand.
// NOTE(review): the `///` comments in this struct are left untouched because derive-based
// argument parsers typically surface them as help text — confirm before rewording them.
#[derive(Args, Debug)]
#[command(name = "hash")]
/// Compute a hash for each cluster.
pub struct HashCmd {
    // Destination path for the per-cluster hash table (`-o` / `--output`).
    /// Specify output file
    #[arg(short = 'o', long = "output", name = "FILE")]
    output: PathBuf,

    // Path of the ISBN cluster file to read; no short or long flag is declared for it.
    /// Specify input file
    #[arg(name = "ISBN_CLUSTERS")]
    cluster_file: PathBuf,
}
27
/// One output row: the hash computed for a single cluster.
#[derive(ParquetRecordWriter)]
struct ClusterHash {
    /// Cluster identifier, taken from the input `cluster` column.
    cluster: i32,
    /// Hex-encoded MD5 digest of the cluster's ISBN strings, hashed in sorted ISBN order.
    isbn_hash: String,
    /// Last byte of the digest modulo 2, so always 0 or 1.
    isbn_dcode: i8,
}
34
35/// Load ISBN data
36fn scan_isbns(path: &Path) -> Result<LazyFrame> {
37    let path = path
38        .to_str()
39        .map(|s| s.to_string())
40        .ok_or(anyhow!("invalid UTF8 pathname"))?;
41    info!("scanning ISBN cluster file {}", path);
42    let icl = LazyFrame::scan_parquet(path, default())?;
43    let icl = icl.select(&[col("isbn"), col("cluster")]);
44    Ok(icl)
45}
46
47impl Command for HashCmd {
48    fn exec(&self) -> Result<()> {
49        let isbns = scan_isbns(self.cluster_file.as_path())?;
50
51        // It would be nice to do this with group-by, but group-by is quite slow and introduces
52        // unhelpful overhead. Sorting (for consistency) and a custom loop to aggregate ISBNs
53        // into hashes is much more efficient.
54        info!("reading sorted ISBNs into memory");
55        let isbns = isbns.sort("isbn", SortOptions::default()).collect()?;
56
57        info!("computing ISBN hashes");
58        let mut hashes: HashMap<i32, Md5> = HashMap::new();
59        let isbn_col = isbns.column("isbn")?.str()?;
60        let clus_col = isbns.column("cluster")?.i32()?;
61        for pair in isbn_col.into_iter().zip(clus_col.into_iter()) {
62            if let (Some(i), Some(c)) = pair {
63                hashes.entry(c).or_default().update(i.as_bytes());
64            }
65        }
66
67        info!("computed hashes for {} clusters", hashes.len());
68
69        let path = self.output.as_path();
70        info!("writing ISBN hashes to {:?}", path);
71        let mut writer = TableWriter::open(path)?;
72        for (cluster, h) in hashes.into_iter() {
73            let h = h.finalize();
74            writer.write_object(ClusterHash {
75                cluster,
76                isbn_hash: hex::encode(h),
77                isbn_dcode: (h[h.len() - 1] % 2) as i8,
78            })?;
79        }
80
81        writer.finish()?;
82
83        Ok(())
84    }
85}