// bookdata/marc/book_fields.rs
use parquet_derive::ParquetRecordWriter;
3use serde::Serialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::{parse_isbn_string, ParseResult};
7use crate::cleaning::names::clean_name;
8use crate::marc::flat_fields::FieldOutput;
9use crate::marc::MARCRecord;
10use crate::prelude::*;
11
12#[derive(ParquetRecordWriter, Debug)]
14struct BookIds {
15 rec_id: u32,
16 marc_cn: String,
17 lccn: Option<String>,
18 status: u8,
19 rec_type: u8,
20 bib_level: u8,
21}
22
23#[derive(Serialize, ParquetRecordWriter, Debug)]
25struct ISBNrec {
26 rec_id: u32,
27 isbn: String,
28 tag: Option<String>,
29}
30
31#[derive(Serialize, ParquetRecordWriter, Debug)]
33struct AuthRec {
34 rec_id: u32,
35 author_name: String,
36}
37
38pub struct BookOutput {
40 n_books: u32,
41 prefix: String,
42 fields: FieldOutput,
43 ids: TableWriter<BookIds>,
44 isbns: TableWriter<ISBNrec>,
45 authors: TableWriter<AuthRec>,
46}
47
48impl BookOutput {
49 pub fn open(prefix: &str) -> Result<BookOutput> {
50 let ffn = format!("{}-fields.parquet", prefix);
51 info!("writing book fields to {}", ffn);
52 let fields = TableWriter::open(ffn)?;
53 let fields = FieldOutput::new(fields);
54
55 let idfn = format!("{}-ids.parquet", prefix);
56 info!("writing book IDs to {}", idfn);
57 let ids = TableWriter::open(idfn)?;
58
59 let isbnfn = format!("{}-isbns.parquet", prefix);
60 info!("writing book IDs to {}", isbnfn);
61 let isbns = TableWriter::open(isbnfn)?;
62
63 let authfn = format!("{}-authors.parquet", prefix);
64 info!("writing book authors to {}", authfn);
65 let authors = TableWriter::open(authfn)?;
66
67 Ok(BookOutput {
68 n_books: 0,
69 prefix: prefix.to_string(),
70 fields,
71 ids,
72 isbns,
73 authors,
74 })
75 }
76}
77
78impl DataSink for BookOutput {
79 fn output_files(&self) -> Vec<PathBuf> {
80 vec![
81 format!("{}-fields.parquet", &self.prefix).into(),
82 format!("{}-ids.parquet", &self.prefix).into(),
83 format!("{}-isbns.parquet", &self.prefix).into(),
84 ]
85 }
86}
87
88impl ObjectWriter<MARCRecord> for BookOutput {
89 fn write_object(&mut self, record: MARCRecord) -> Result<()> {
90 if !record.is_book() {
91 return Ok(());
92 }
93 self.n_books += 1;
94 let rec_id = self.n_books;
95
96 for df in &record.fields {
98 if df.tag == 20 {
100 for sf in &df.subfields {
101 if sf.code == 'a' {
102 match parse_isbn_string(&sf.content) {
103 ParseResult::Valid(isbns, _) => {
104 for isbn in isbns {
105 if isbn.tags.len() > 0 {
106 for tag in isbn.tags {
107 self.isbns.write_object(ISBNrec {
108 rec_id,
109 isbn: isbn.text.clone(),
110 tag: Some(tag),
111 })?;
112 }
113 } else {
114 self.isbns.write_object(ISBNrec {
115 rec_id,
116 isbn: isbn.text,
117 tag: None,
118 })?;
119 }
120 }
121 }
122 ParseResult::Ignored(_) => (),
123 ParseResult::Unmatched(s) => {
124 warn!("unmatched ISBN text {}", s)
125 }
126 }
127 }
128 }
129 } else if df.tag == 100 {
130 for sf in &df.subfields {
132 if sf.code == 'a' {
133 let content = sf.content.trim();
134 let author_name = clean_name(content);
135 if !author_name.is_empty() {
136 self.authors.write_object(AuthRec {
137 rec_id,
138 author_name,
139 })?;
140 }
141 }
142 }
143 }
144 }
145
146 let ids = BookIds {
148 rec_id,
149 marc_cn: record
150 .marc_control()
151 .ok_or_else(|| anyhow!("no MARC control number"))?
152 .to_owned(),
153 lccn: record.lccn().map(|s| s.to_owned()),
154 status: record.rec_status().unwrap_or(0),
155 rec_type: record.rec_type().unwrap_or(0),
156 bib_level: record.rec_bib_level().unwrap_or(0),
157 };
158 self.ids.write_object(ids)?;
159
160 self.fields.write_object(record)?;
161 Ok(())
162 }
163
164 fn finish(self) -> Result<usize> {
165 self.fields.finish()?;
166 self.ids.finish()?;
167 self.isbns.finish()?;
168 self.authors.finish()?;
169 Ok(self.n_books as usize)
170 }
171}