bookdata/marc/
book_fields.rs1use parquet_derive::ParquetRecordWriter;
3use serde::Serialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::{parse_isbn_string, ParseResult};
7use crate::cleaning::names::clean_name;
8use crate::marc::flat_fields::FieldOutput;
9use crate::marc::MARCRecord;
10use crate::prelude::*;
11
12#[derive(ParquetRecordWriter, Debug)]
14struct BookIds {
15 rec_id: u32,
16 marc_cn: String,
17 lccn: Option<String>,
18 status: u8,
19 rec_type: u8,
20 bib_level: u8,
21}
22
23#[derive(Serialize, ParquetRecordWriter, Debug)]
25struct ISBNrec {
26 rec_id: u32,
27 isbn: String,
28 tag: Option<String>,
29}
30
31#[derive(Serialize, ParquetRecordWriter, Debug)]
33struct AuthRec {
34 rec_id: u32,
35 author_name: String,
36}
37
38pub struct BookOutput {
40 n_books: u32,
41 prefix: String,
42 fields: FieldOutput,
43 ids: TableWriter<BookIds>,
44 isbns: TableWriter<ISBNrec>,
45 authors: TableWriter<AuthRec>,
46}
47
48impl BookOutput {
49 pub fn open(prefix: &str) -> Result<BookOutput> {
50 let ffn = format!("{}-fields.parquet", prefix);
51 info!("writing book fields to {}", ffn);
52 let fields = TableWriter::open(ffn)?;
53 let fields = FieldOutput::new(fields);
54
55 let idfn = format!("{}-ids.parquet", prefix);
56 info!("writing book IDs to {}", idfn);
57 let ids = TableWriter::open(idfn)?;
58
59 let isbnfn = format!("{}-isbns.parquet", prefix);
60 info!("writing book IDs to {}", isbnfn);
61 let isbns = TableWriter::open(isbnfn)?;
62
63 let authfn = format!("{}-authors.parquet", prefix);
64 info!("writing book authors to {}", authfn);
65 let authors = TableWriter::open(authfn)?;
66
67 Ok(BookOutput {
68 n_books: 0,
69 prefix: prefix.to_string(),
70 fields,
71 ids,
72 isbns,
73 authors,
74 })
75 }
76}
77
78impl ObjectWriter<MARCRecord> for BookOutput {
79 fn write_object(&mut self, record: MARCRecord) -> Result<()> {
80 if !record.is_book() {
81 return Ok(());
82 }
83 self.n_books += 1;
84 let rec_id = self.n_books;
85
86 for df in &record.fields {
88 if df.tag == 20 {
90 for sf in &df.subfields {
91 if sf.code == 'a' {
92 match parse_isbn_string(&sf.content) {
93 ParseResult::Valid(isbns, _) => {
94 for isbn in isbns {
95 if isbn.tags.len() > 0 {
96 for tag in isbn.tags {
97 self.isbns.write_object(ISBNrec {
98 rec_id,
99 isbn: isbn.text.clone(),
100 tag: Some(tag),
101 })?;
102 }
103 } else {
104 self.isbns.write_object(ISBNrec {
105 rec_id,
106 isbn: isbn.text,
107 tag: None,
108 })?;
109 }
110 }
111 }
112 ParseResult::Ignored(_) => (),
113 ParseResult::Unmatched(s) => {
114 warn!("unmatched ISBN text {}", s)
115 }
116 }
117 }
118 }
119 } else if df.tag == 100 {
120 for sf in &df.subfields {
122 if sf.code == 'a' {
123 let content = sf.content.trim();
124 let author_name = clean_name(content);
125 if !author_name.is_empty() {
126 self.authors.write_object(AuthRec {
127 rec_id,
128 author_name,
129 })?;
130 }
131 }
132 }
133 }
134 }
135
136 let ids = BookIds {
138 rec_id,
139 marc_cn: record
140 .marc_control()
141 .ok_or_else(|| anyhow!("no MARC control number"))?
142 .to_owned(),
143 lccn: record.lccn().map(|s| s.to_owned()),
144 status: record.rec_status().unwrap_or(0),
145 rec_type: record.rec_type().unwrap_or(0),
146 bib_level: record.rec_bib_level().unwrap_or(0),
147 };
148 self.ids.write_object(ids)?;
149
150 self.fields.write_object(record)?;
151 Ok(())
152 }
153
154 fn finish_objects(self) -> Result<usize> {
155 self.fields.finish_objects()?;
156 self.ids.finish_objects()?;
157 self.isbns.finish_objects()?;
158 self.authors.finish_objects()?;
159 Ok(self.n_books as usize)
160 }
161}