bookdata/cli/cluster/author_gender/
authors.rs1use std::collections::HashMap;
3
4use parquet_derive::ParquetRecordReader;
5
6use crate::arrow::*;
7use crate::gender::*;
8use crate::prelude::*;
9use crate::util::logging::item_progress;
10
11#[derive(Debug, Default)]
12pub struct AuthorInfo {
13 pub n_author_recs: u32,
14 pub genders: GenderBag,
15}
16
17pub type AuthorTable = HashMap<String, AuthorInfo>;
18
19#[derive(Debug, ParquetRecordReader)]
20struct NameRow {
21 rec_id: u32,
22 name: String,
23}
24
25#[derive(Debug, ParquetRecordReader)]
26struct GenderRow {
27 rec_id: u32,
28 gender: String,
29}
30
31fn viaf_load_names() -> Result<HashMap<u32, Vec<String>>> {
33 let mut map: HashMap<u32, Vec<String>> = HashMap::new();
34
35 info!("loading VIAF author names");
36 let iter = scan_parquet_file("viaf/author-name-index.parquet")?;
37
38 let pb = item_progress(iter.remaining() as u64, "authors");
39 let iter = pb.wrap_iter(iter);
40 let timer = Timer::new();
41
42 for row in iter {
43 let row: NameRow = row?;
44 map.entry(row.rec_id).or_default().push(row.name);
45 }
46
47 info!(
48 "loaded authors for {} records in {}",
49 map.len(),
50 timer.human_elapsed()
51 );
52
53 Ok(map)
54}
55
56fn viaf_load_genders() -> Result<HashMap<u32, GenderBag>> {
58 let mut map: HashMap<u32, GenderBag> = HashMap::new();
59 let timer = Timer::new();
60
61 info!("loading VIAF author genders");
62 let iter = scan_parquet_file("viaf/author-genders.parquet")?;
63
64 let pb = item_progress(iter.remaining(), "authors");
65 let iter = pb.wrap_iter(iter);
66
67 for row in iter {
68 let row: GenderRow = row?;
69 let gender: Gender = row.gender.into();
70 map.entry(row.rec_id).or_default().add(gender);
71 }
72
73 info!(
74 "loaded genders for {} records in {}",
75 map.len(),
76 timer.human_elapsed()
77 );
78
79 Ok(map)
80}
81
82#[inline(never)]
84pub fn viaf_author_table() -> Result<AuthorTable> {
85 let mut table = AuthorTable::new();
86
87 let rec_names = viaf_load_names()?;
88 let rec_genders = viaf_load_genders()?;
89
90 info!("merging gender records");
91 let pb = item_progress(rec_names.len() as u64, "clusters");
92 let timer = Timer::new();
93 for (rec_id, names) in pb.wrap_iter(rec_names.into_iter()) {
94 let genders = rec_genders.get(&rec_id);
95 for name in names {
96 let rec = table.entry(name).or_default();
97 rec.n_author_recs += 1;
98 if let Some(bag) = genders {
99 rec.genders.merge_from(bag);
100 }
101 }
102 }
103
104 info!("merged {} gender records in {}", table.len(), timer);
105
106 Ok(table)
107}