bookdata/openlib/
author.rs

1//! OpenLibrary author schemas.
2use parquet_derive::ParquetRecordWriter;
3
4use crate::arrow::*;
5use crate::cleaning::names::clean_name;
6use crate::cleaning::strings::norm_unicode;
7use crate::prelude::*;
8
9use super::key::{parse_ol_key, KS_AUTHOR};
10pub use super::source::OLAuthorSource;
11use super::source::Row;
12
13/// An author record in the extracted Parquet.
14#[derive(ParquetRecordWriter)]
15pub struct AuthorRec {
16    pub id: u32,
17    pub key: String,
18    pub name: Option<String>,
19}
20
21/// An author-name record in the extracted Parquet.
22#[derive(ParquetRecordWriter)]
23pub struct AuthorNameRec {
24    pub id: u32,
25    pub source: u8,
26    pub name: String,
27}
28
29/// Get a list of author name records for an author.
30pub fn author_name_records(src: &OLAuthorSource, id: u32) -> Vec<AuthorNameRec> {
31    let mut names = Vec::new();
32
33    if let Some(n) = &src.name {
34        names.push(AuthorNameRec {
35            id,
36            source: b'n',
37            name: clean_name(&n),
38        });
39    }
40
41    if let Some(n) = &src.personal_name {
42        names.push(AuthorNameRec {
43            id,
44            source: b'p',
45            name: clean_name(&n),
46        });
47    }
48
49    for n in &src.alternate_names {
50        names.push(AuthorNameRec {
51            id,
52            source: b'a',
53            name: clean_name(&n),
54        });
55    }
56
57    names
58}
59
60/// Process author records into Parquet.
61pub struct AuthorProcessor {
62    rec_writer: TableWriter<AuthorRec>,
63    name_writer: TableWriter<AuthorNameRec>,
64}
65
66impl AuthorProcessor {
67    pub fn new() -> Result<AuthorProcessor> {
68        Ok(AuthorProcessor {
69            rec_writer: TableWriter::open("authors.parquet")?,
70            name_writer: TableWriter::open("author-names.parquet")?,
71        })
72    }
73}
74
75impl ObjectWriter<Row<OLAuthorSource>> for AuthorProcessor {
76    fn write_object(&mut self, row: Row<OLAuthorSource>) -> Result<()> {
77        let id = parse_ol_key(&row.key, KS_AUTHOR)?;
78
79        self.rec_writer.write_object(AuthorRec {
80            id,
81            key: row.key,
82            name: row
83                .record
84                .name
85                .as_ref()
86                .map(|s| norm_unicode(s).into_owned()),
87        })?;
88
89        for name in author_name_records(&row.record, id) {
90            self.name_writer.write_object(name)?;
91        }
92
93        Ok(())
94    }
95
96    fn finish(self) -> Result<usize> {
97        let nr = self.rec_writer.finish()?;
98        self.name_writer.finish()?;
99        Ok(nr)
100    }
101}