bookdata/marc/
book_fields.rs

//! Code for writing extracted information specific to books.
2use parquet_derive::ParquetRecordWriter;
3use serde::Serialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::{parse_isbn_string, ParseResult};
7use crate::cleaning::names::clean_name;
8use crate::marc::flat_fields::FieldOutput;
9use crate::marc::MARCRecord;
10use crate::prelude::*;
11
12/// Structure recording book identifiers from a MARC field.
13#[derive(ParquetRecordWriter, Debug)]
14struct BookIds {
15    rec_id: u32,
16    marc_cn: String,
17    lccn: Option<String>,
18    status: u8,
19    rec_type: u8,
20    bib_level: u8,
21}
22
23/// Structure recording an ISBN record from a book.
24#[derive(Serialize, ParquetRecordWriter, Debug)]
25struct ISBNrec {
26    rec_id: u32,
27    isbn: String,
28    tag: Option<String>,
29}
30
31/// Structure recording a record's author field.
32#[derive(Serialize, ParquetRecordWriter, Debug)]
33struct AuthRec {
34    rec_id: u32,
35    author_name: String,
36}
37
38/// Output that writes books to set of Parquet files.
39pub struct BookOutput {
40    n_books: u32,
41    prefix: String,
42    fields: FieldOutput,
43    ids: TableWriter<BookIds>,
44    isbns: TableWriter<ISBNrec>,
45    authors: TableWriter<AuthRec>,
46}
47
48impl BookOutput {
49    pub fn open(prefix: &str) -> Result<BookOutput> {
50        let ffn = format!("{}-fields.parquet", prefix);
51        info!("writing book fields to {}", ffn);
52        let fields = TableWriter::open(ffn)?;
53        let fields = FieldOutput::new(fields);
54
55        let idfn = format!("{}-ids.parquet", prefix);
56        info!("writing book IDs to {}", idfn);
57        let ids = TableWriter::open(idfn)?;
58
59        let isbnfn = format!("{}-isbns.parquet", prefix);
60        info!("writing book IDs to {}", isbnfn);
61        let isbns = TableWriter::open(isbnfn)?;
62
63        let authfn = format!("{}-authors.parquet", prefix);
64        info!("writing book authors to {}", authfn);
65        let authors = TableWriter::open(authfn)?;
66
67        Ok(BookOutput {
68            n_books: 0,
69            prefix: prefix.to_string(),
70            fields,
71            ids,
72            isbns,
73            authors,
74        })
75    }
76}
77
78impl ObjectWriter<MARCRecord> for BookOutput {
79    fn write_object(&mut self, record: MARCRecord) -> Result<()> {
80        if !record.is_book() {
81            return Ok(());
82        }
83        self.n_books += 1;
84        let rec_id = self.n_books;
85
86        // scan for ISBNs and authors
87        for df in &record.fields {
88            // ISBNs: tag 20, subfield 'a'
89            if df.tag == 20 {
90                for sf in &df.subfields {
91                    if sf.code == 'a' {
92                        match parse_isbn_string(&sf.content) {
93                            ParseResult::Valid(isbns, _) => {
94                                for isbn in isbns {
95                                    if isbn.tags.len() > 0 {
96                                        for tag in isbn.tags {
97                                            self.isbns.write_object(ISBNrec {
98                                                rec_id,
99                                                isbn: isbn.text.clone(),
100                                                tag: Some(tag),
101                                            })?;
102                                        }
103                                    } else {
104                                        self.isbns.write_object(ISBNrec {
105                                            rec_id,
106                                            isbn: isbn.text,
107                                            tag: None,
108                                        })?;
109                                    }
110                                }
111                            }
112                            ParseResult::Ignored(_) => (),
113                            ParseResult::Unmatched(s) => {
114                                warn!("unmatched ISBN text {}", s)
115                            }
116                        }
117                    }
118                }
119            } else if df.tag == 100 {
120                // authors: tag 100, subfield a
121                for sf in &df.subfields {
122                    if sf.code == 'a' {
123                        let content = sf.content.trim();
124                        let author_name = clean_name(content);
125                        if !author_name.is_empty() {
126                            self.authors.write_object(AuthRec {
127                                rec_id,
128                                author_name,
129                            })?;
130                        }
131                    }
132                }
133            }
134        }
135
136        // emit book IDs
137        let ids = BookIds {
138            rec_id,
139            marc_cn: record
140                .marc_control()
141                .ok_or_else(|| anyhow!("no MARC control number"))?
142                .to_owned(),
143            lccn: record.lccn().map(|s| s.to_owned()),
144            status: record.rec_status().unwrap_or(0),
145            rec_type: record.rec_type().unwrap_or(0),
146            bib_level: record.rec_bib_level().unwrap_or(0),
147        };
148        self.ids.write_object(ids)?;
149
150        self.fields.write_object(record)?;
151        Ok(())
152    }
153
154    fn finish_objects(self) -> Result<usize> {
155        self.fields.finish_objects()?;
156        self.ids.finish_objects()?;
157        self.isbns.finish_objects()?;
158        self.authors.finish_objects()?;
159        Ok(self.n_books as usize)
160    }
161}