bookdata/cli/
scan_marc.rs

1//! Scan MARC records.  See [ScanMARC] for documentation.
2use std::path::PathBuf;
3use std::time::Instant;
4
5use log::*;
6
7use clap::Args;
8use glob::glob;
9
10use crate::io::{log_file_info, open_gzin_progress};
11use crate::prelude::*;
12
13use crate::marc::book_fields::BookOutput;
14use crate::marc::flat_fields::FieldOutput;
15use crate::marc::parse::{scan_records, scan_records_delim};
16use crate::marc::MARCRecord;
17use crate::util::logging::{data_progress, item_progress};
18
19/// Scan MARC records and extract basic information.
20///
21/// This tool scans MARC-XML records, in either raw or delimited-line format,
22/// and writes the fields to a Parquet file of flat field records.  It has two
23/// modes: normal, which simply writes MARC fields to the Parquet file, and
24/// 'book mode', which only saves books and produces additional output files
25/// summarizing book record information and book ISBNs.
26#[derive(Args, Debug)]
27#[command(name = "scan-marc")]
28pub struct ScanMARC {
29    /// Output files for normal mode.
30    #[arg(short = 'o', long = "output")]
31    output: Option<PathBuf>,
32
33    /// Prefix for output files in book mode.
34    #[arg(short = 'p', long = "output-prefix")]
35    prefix: Option<String>,
36
37    /// Turn on book mode.
38    #[arg(long = "book-mode")]
39    book_mode: bool,
40
41    /// Read in line mode
42    #[arg(short = 'L', long = "line-mode")]
43    line_mode: bool,
44
45    /// Glob for files to parse.
46    #[arg(short = 'G', long = "glob")]
47    glob: Option<String>,
48
49    /// Input files to parse (GZ-compressed)
50    #[arg(name = "FILE")]
51    files: Vec<PathBuf>,
52}
53
54impl Command for ScanMARC {
55    fn exec(&self) -> Result<()> {
56        // dispatch based on our operating mode
57        if self.book_mode {
58            let pfx = match &self.prefix {
59                Some(p) => p,
60                None => "book",
61            };
62            let output = BookOutput::open(pfx)?;
63            self.process_records(output)?;
64        } else {
65            let ofn = match &self.output {
66                Some(p) => p.clone(),
67                None => PathBuf::from("marc-fields.parquet"),
68            };
69            let output = FieldOutput::open(&ofn)?;
70            self.process_records(output)?;
71        };
72
73        Ok(())
74    }
75}
76
77impl ScanMARC {
78    fn find_files(&self) -> Result<Vec<PathBuf>> {
79        if let Some(ref gs) = self.glob {
80            info!("scanning for files {}", gs);
81            let mut v = Vec::new();
82            for entry in glob(gs)? {
83                let entry = entry?;
84                v.push(entry);
85            }
86            Ok(v)
87        } else {
88            Ok(self.files.clone())
89        }
90    }
91
92    fn process_records<W: ObjectWriter<MARCRecord> + DataSink + Send + Sync + 'static>(
93        &self,
94        mut output: W,
95    ) -> Result<()> {
96        let mut nfiles = 0;
97        let mut all_recs = 0;
98        let all_start = Instant::now();
99        let files = self.find_files()?;
100        let fpb = item_progress(files.len(), "input files");
101
102        for inf in files {
103            nfiles += 1;
104            fpb.inc(1);
105            let inf = inf.as_path();
106            let file_start = Instant::now();
107            info!("reading from compressed file {}", inf.display());
108            let pb = data_progress(0);
109            let read = open_gzin_progress(inf, pb.clone())?;
110            let nrecs = if self.line_mode {
111                scan_records_delim(read, &mut output)?
112            } else {
113                scan_records(read, &mut output)?
114            };
115
116            info!(
117                "processed {} records from {} in {:.2}s",
118                nrecs,
119                inf.display(),
120                file_start.elapsed().as_secs_f32()
121            );
122            all_recs += nrecs;
123        }
124        fpb.finish_and_clear();
125
126        let outs = output.output_files();
127        let written = output.finish()?;
128
129        info!(
130            "imported {} fields from {} records from {} files in {:.2}s",
131            written,
132            all_recs,
133            nfiles,
134            all_start.elapsed().as_secs_f32()
135        );
136        log_file_info(&outs)?;
137
138        Ok(())
139    }
140}