bookdata/cli/cluster/author_gender/
authors.rs

1//! Support for loading author info.
2use std::collections::HashMap;
3
4use parquet_derive::ParquetRecordReader;
5
6use crate::arrow::*;
7use crate::gender::*;
8use crate::prelude::*;
9use crate::util::logging::item_progress;
10
11#[derive(Debug, Default)]
12pub struct AuthorInfo {
13    pub n_author_recs: u32,
14    pub genders: GenderBag,
15}
16
17pub type AuthorTable = HashMap<String, AuthorInfo>;
18
19#[derive(Debug, ParquetRecordReader)]
20struct NameRow {
21    rec_id: u32,
22    name: String,
23}
24
25#[derive(Debug, ParquetRecordReader)]
26struct GenderRow {
27    rec_id: u32,
28    gender: String,
29}
30
31/// Load VIAF author names.
32fn viaf_load_names() -> Result<HashMap<u32, Vec<String>>> {
33    let mut map: HashMap<u32, Vec<String>> = HashMap::new();
34
35    info!("loading VIAF author names");
36    let iter = scan_parquet_file("viaf/author-name-index.parquet")?;
37
38    let pb = item_progress(iter.remaining() as u64, "authors");
39    let iter = pb.wrap_iter(iter);
40    let timer = Timer::new();
41
42    for row in iter {
43        let row: NameRow = row?;
44        map.entry(row.rec_id).or_default().push(row.name);
45    }
46
47    info!(
48        "loaded authors for {} records in {}",
49        map.len(),
50        timer.human_elapsed()
51    );
52
53    Ok(map)
54}
55
56/// Load VIAF author genders
57fn viaf_load_genders() -> Result<HashMap<u32, GenderBag>> {
58    let mut map: HashMap<u32, GenderBag> = HashMap::new();
59    let timer = Timer::new();
60
61    info!("loading VIAF author genders");
62    let iter = scan_parquet_file("viaf/author-genders.parquet")?;
63
64    let pb = item_progress(iter.remaining(), "authors");
65    let iter = pb.wrap_iter(iter);
66
67    for row in iter {
68        let row: GenderRow = row?;
69        let gender: Gender = row.gender.into();
70        map.entry(row.rec_id).or_default().add(gender);
71    }
72
73    info!(
74        "loaded genders for {} records in {}",
75        map.len(),
76        timer.human_elapsed()
77    );
78
79    Ok(map)
80}
81
82/// Load the VIAF author gender records.
83#[inline(never)]
84pub fn viaf_author_table() -> Result<AuthorTable> {
85    let mut table = AuthorTable::new();
86
87    let rec_names = viaf_load_names()?;
88    let rec_genders = viaf_load_genders()?;
89
90    info!("merging gender records");
91    let pb = item_progress(rec_names.len() as u64, "clusters");
92    let timer = Timer::new();
93    for (rec_id, names) in pb.wrap_iter(rec_names.into_iter()) {
94        let genders = rec_genders.get(&rec_id);
95        for name in names {
96            let rec = table.entry(name).or_default();
97            rec.n_author_recs += 1;
98            if let Some(bag) = genders {
99                rec.genders.merge_from(bag);
100            }
101        }
102    }
103
104    info!("merged {} gender records in {}", table.len(), timer);
105
106    Ok(table)
107}