bookdata/cli/
scan_marc.rs1use std::path::PathBuf;
3use std::time::Instant;
4
5use log::*;
6
7use clap::Args;
8use glob::glob;
9
10use crate::io::{log_file_info, open_gzin_progress};
11use crate::prelude::*;
12
13use crate::marc::book_fields::BookOutput;
14use crate::marc::flat_fields::FieldOutput;
15use crate::marc::parse::{scan_records, scan_records_delim};
16use crate::marc::MARCRecord;
17use crate::util::logging::{data_progress, item_progress};
18
19#[derive(Args, Debug)]
27#[command(name = "scan-marc")]
28pub struct ScanMARC {
29 #[arg(short = 'o', long = "output")]
31 output: Option<PathBuf>,
32
33 #[arg(short = 'p', long = "output-prefix")]
35 prefix: Option<String>,
36
37 #[arg(long = "book-mode")]
39 book_mode: bool,
40
41 #[arg(short = 'L', long = "line-mode")]
43 line_mode: bool,
44
45 #[arg(short = 'G', long = "glob")]
47 glob: Option<String>,
48
49 #[arg(name = "FILE")]
51 files: Vec<PathBuf>,
52}
53
54impl Command for ScanMARC {
55 fn exec(&self) -> Result<()> {
56 if self.book_mode {
58 let pfx = match &self.prefix {
59 Some(p) => p,
60 None => "book",
61 };
62 let output = BookOutput::open(pfx)?;
63 self.process_records(output)?;
64 } else {
65 let ofn = match &self.output {
66 Some(p) => p.clone(),
67 None => PathBuf::from("marc-fields.parquet"),
68 };
69 let output = FieldOutput::open(&ofn)?;
70 self.process_records(output)?;
71 };
72
73 Ok(())
74 }
75}
76
77impl ScanMARC {
78 fn find_files(&self) -> Result<Vec<PathBuf>> {
79 if let Some(ref gs) = self.glob {
80 info!("scanning for files {}", gs);
81 let mut v = Vec::new();
82 for entry in glob(gs)? {
83 let entry = entry?;
84 v.push(entry);
85 }
86 Ok(v)
87 } else {
88 Ok(self.files.clone())
89 }
90 }
91
92 fn process_records<W: ObjectWriter<MARCRecord> + DataSink + Send + Sync + 'static>(
93 &self,
94 mut output: W,
95 ) -> Result<()> {
96 let mut nfiles = 0;
97 let mut all_recs = 0;
98 let all_start = Instant::now();
99 let files = self.find_files()?;
100 let fpb = item_progress(files.len(), "input files");
101
102 for inf in files {
103 nfiles += 1;
104 fpb.inc(1);
105 let inf = inf.as_path();
106 let file_start = Instant::now();
107 info!("reading from compressed file {}", inf.display());
108 let pb = data_progress(0);
109 let read = open_gzin_progress(inf, pb.clone())?;
110 let nrecs = if self.line_mode {
111 scan_records_delim(read, &mut output)?
112 } else {
113 scan_records(read, &mut output)?
114 };
115
116 info!(
117 "processed {} records from {} in {:.2}s",
118 nrecs,
119 inf.display(),
120 file_start.elapsed().as_secs_f32()
121 );
122 all_recs += nrecs;
123 }
124 fpb.finish_and_clear();
125
126 let outs = output.output_files();
127 let written = output.finish()?;
128
129 info!(
130 "imported {} fields from {} records from {} files in {:.2}s",
131 written,
132 all_recs,
133 nfiles,
134 all_start.elapsed().as_secs_f32()
135 );
136 log_file_info(&outs)?;
137
138 Ok(())
139 }
140}