bookdata/cli/bx/
extract.rs

//! BookCrossing data extraction.
//!
//! The BookCrossing CSV files are corrupt, so this command extracts the ratings
//! file from the zip archive, strips non-ASCII bytes and carriage returns, and
//! converts the semicolon delimiters to commas to make it well-formed CSV.
5use std::fs::File;
6use std::io::Write;
7use std::path::PathBuf;
8
9use zip::ZipArchive;
10
11use crate::prelude::*;
12
13#[derive(Args, Debug)]
14pub struct Extract {
15    /// The zip file to read.
16    #[arg(name = "ZIPFILE")]
17    zipfile: PathBuf,
18
19    /// The CSV file to write.
20    #[arg(name = "OUTFILE")]
21    outfile: PathBuf,
22}
23
24impl Command for Extract {
25    fn exec(&self) -> Result<()> {
26        info!("reading {:?}", self.zipfile);
27        let file = File::open(&self.zipfile)?;
28        let mut zip = ZipArchive::new(file)?;
29        let mut entry = zip.by_name("BX-Book-Ratings.csv")?;
30        let mut data = entry.read_all_sized()?;
31
32        info!("cleaning up data file");
33
34        debug!("removing non-ASCII characters and carriage returns");
35        data.retain(|b| *b < 128 && *b != b'\r');
36
37        debug!("replacing semicolons to make CSV");
38        // can this be done with retain_with?
39        for i in 0..data.len() {
40            let c = data[i];
41            if c == b';' {
42                data[i] = b',';
43            }
44        }
45
46        debug!("splitting CSV header");
47        let data = String::from_utf8(data)?;
48        let pos = if let Some(p) = data.find('\n') {
49            p
50        } else {
51            error!("no newline found, corrupt input data?");
52            return Err(anyhow!("corrupt data"));
53        };
54        let (hdr, rest) = data.split_at(pos + 1);
55        if !hdr.starts_with("\"User-ID\",") {
56            error!("unexpected file header found");
57            info!("found header: “{}“", hdr);
58            info!("expected cleaned header to begin with “\"User-ID\",“");
59            return Err(anyhow!("corrupt data"));
60        }
61
62        info!("writing cleaned output");
63        let mut out = File::create(&self.outfile)?;
64        write!(out, "user,isbn,rating\n")?;
65        let csvin = csv::Reader::from_reader(rest.as_bytes());
66        for row in csvin.into_records() {
67            let row = row?;
68            let user = row.get(0).ok_or(anyhow!("invalid CSV row"))?;
69            let isbn = row.get(1).ok_or(anyhow!("invalid CSV row"))?;
70            let rating = row.get(2).ok_or(anyhow!("invalid CSV row"))?;
71
72            let mut isbn = isbn.to_uppercase();
73            isbn.retain(|c| (c >= '0' && c <= '9') || c == 'X');
74
75            if isbn.len() > 0 {
76                write!(out, "{},{},{}\n", user, isbn, rating)?;
77            }
78        }
79
80        Ok(())
81    }
82}