bookdata/marc/
book_fields.rs

1//! Code for writing extracted information specific to books.
2use parquet_derive::ParquetRecordWriter;
3use serde::Serialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::{parse_isbn_string, ParseResult};
7use crate::cleaning::names::clean_name;
8use crate::marc::flat_fields::FieldOutput;
9use crate::marc::MARCRecord;
10use crate::prelude::*;
11
12/// Structure recording book identifiers from a MARC field.
13#[derive(ParquetRecordWriter, Debug)]
14struct BookIds {
15    rec_id: u32,
16    marc_cn: String,
17    lccn: Option<String>,
18    status: u8,
19    rec_type: u8,
20    bib_level: u8,
21}
22
23/// Structure recording an ISBN record from a book.
24#[derive(Serialize, ParquetRecordWriter, Debug)]
25struct ISBNrec {
26    rec_id: u32,
27    isbn: String,
28    tag: Option<String>,
29}
30
31/// Structure recording a record's author field.
32#[derive(Serialize, ParquetRecordWriter, Debug)]
33struct AuthRec {
34    rec_id: u32,
35    author_name: String,
36}
37
38/// Output that writes books to set of Parquet files.
39pub struct BookOutput {
40    n_books: u32,
41    prefix: String,
42    fields: FieldOutput,
43    ids: TableWriter<BookIds>,
44    isbns: TableWriter<ISBNrec>,
45    authors: TableWriter<AuthRec>,
46}
47
48impl BookOutput {
49    pub fn open(prefix: &str) -> Result<BookOutput> {
50        let ffn = format!("{}-fields.parquet", prefix);
51        info!("writing book fields to {}", ffn);
52        let fields = TableWriter::open(ffn)?;
53        let fields = FieldOutput::new(fields);
54
55        let idfn = format!("{}-ids.parquet", prefix);
56        info!("writing book IDs to {}", idfn);
57        let ids = TableWriter::open(idfn)?;
58
59        let isbnfn = format!("{}-isbns.parquet", prefix);
60        info!("writing book IDs to {}", isbnfn);
61        let isbns = TableWriter::open(isbnfn)?;
62
63        let authfn = format!("{}-authors.parquet", prefix);
64        info!("writing book authors to {}", authfn);
65        let authors = TableWriter::open(authfn)?;
66
67        Ok(BookOutput {
68            n_books: 0,
69            prefix: prefix.to_string(),
70            fields,
71            ids,
72            isbns,
73            authors,
74        })
75    }
76}
77
78impl DataSink for BookOutput {
79    fn output_files(&self) -> Vec<PathBuf> {
80        vec![
81            format!("{}-fields.parquet", &self.prefix).into(),
82            format!("{}-ids.parquet", &self.prefix).into(),
83            format!("{}-isbns.parquet", &self.prefix).into(),
84        ]
85    }
86}
87
88impl ObjectWriter<MARCRecord> for BookOutput {
89    fn write_object(&mut self, record: MARCRecord) -> Result<()> {
90        if !record.is_book() {
91            return Ok(());
92        }
93        self.n_books += 1;
94        let rec_id = self.n_books;
95
96        // scan for ISBNs and authors
97        for df in &record.fields {
98            // ISBNs: tag 20, subfield 'a'
99            if df.tag == 20 {
100                for sf in &df.subfields {
101                    if sf.code == 'a' {
102                        match parse_isbn_string(&sf.content) {
103                            ParseResult::Valid(isbns, _) => {
104                                for isbn in isbns {
105                                    if isbn.tags.len() > 0 {
106                                        for tag in isbn.tags {
107                                            self.isbns.write_object(ISBNrec {
108                                                rec_id,
109                                                isbn: isbn.text.clone(),
110                                                tag: Some(tag),
111                                            })?;
112                                        }
113                                    } else {
114                                        self.isbns.write_object(ISBNrec {
115                                            rec_id,
116                                            isbn: isbn.text,
117                                            tag: None,
118                                        })?;
119                                    }
120                                }
121                            }
122                            ParseResult::Ignored(_) => (),
123                            ParseResult::Unmatched(s) => {
124                                warn!("unmatched ISBN text {}", s)
125                            }
126                        }
127                    }
128                }
129            } else if df.tag == 100 {
130                // authors: tag 100, subfield a
131                for sf in &df.subfields {
132                    if sf.code == 'a' {
133                        let content = sf.content.trim();
134                        let author_name = clean_name(content);
135                        if !author_name.is_empty() {
136                            self.authors.write_object(AuthRec {
137                                rec_id,
138                                author_name,
139                            })?;
140                        }
141                    }
142                }
143            }
144        }
145
146        // emit book IDs
147        let ids = BookIds {
148            rec_id,
149            marc_cn: record
150                .marc_control()
151                .ok_or_else(|| anyhow!("no MARC control number"))?
152                .to_owned(),
153            lccn: record.lccn().map(|s| s.to_owned()),
154            status: record.rec_status().unwrap_or(0),
155            rec_type: record.rec_type().unwrap_or(0),
156            bib_level: record.rec_bib_level().unwrap_or(0),
157        };
158        self.ids.write_object(ids)?;
159
160        self.fields.write_object(record)?;
161        Ok(())
162    }
163
164    fn finish(self) -> Result<usize> {
165        self.fields.finish()?;
166        self.ids.finish()?;
167        self.isbns.finish()?;
168        self.authors.finish()?;
169        Ok(self.n_books as usize)
170    }
171}