bookdata/cli/bx/
cluster.rs1use std::path::PathBuf;
3
4use crate::prelude::*;
5use polars::prelude::*;
6
/// Cluster the cleaned ratings data in the `bx` working directory by ISBN cluster.
///
/// At least one of `--ratings` or `--add-actions` must be given.
// NOTE(review): when both flags are passed, `exec` only consults `ratings` when
// choosing the filter and the aggregation, so the ratings mode takes precedence.
#[derive(Args, Debug)]
#[command(name = "cluster-actions")]
pub struct Cluster {
    /// Aggregate ratings: keep rows with a rating above 0 and report the median
    /// rating and the rating count for each user and cluster.
    #[arg(long = "ratings")]
    ratings: bool,

    /// Count actions for each user and cluster without filtering on the rating.
    #[arg(long = "add-actions")]
    add_actions: bool,

    /// File the aggregated table is saved to.
    #[arg(short = 'o', long = "output", name = "FILE")]
    outfile: PathBuf,
}
22
23impl Command for Cluster {
24 fn exec(&self) -> Result<()> {
25 if !self.ratings && !self.add_actions {
26 error!("one of --ratings or --add-actions must be specified");
27 return Err(anyhow!("no mode specified"));
28 }
29 require_working_dir("bx")?;
30
31 let isbns = LazyFrame::scan_parquet("../book-links/isbn-clusters.parquet", default())?;
32 let isbns = isbns.select(&[col("isbn"), col("cluster")]);
33
34 let ratings = LazyCsvReader::new("cleaned-ratings.csv")
35 .has_header(true)
36 .finish()?;
37 let ratings = if self.ratings {
38 ratings.filter(col("rating").gt(0))
39 } else {
40 ratings
41 };
42 let joined = ratings.join(
43 isbns,
44 &[col("isbn")],
45 &[col("isbn")],
46 JoinType::Inner.into(),
47 );
48 let grouped = joined.group_by(&[col("user_id"), col("cluster").alias("item_id")]);
49 let agg = if self.ratings {
50 grouped.agg(&[
51 col("rating").median().alias("rating"),
52 col("cluster").count().alias("nratings"),
53 ])
54 } else {
55 grouped.agg(&[col("cluster").count().alias("nactions")])
56 };
57
58 info!("collecting results");
59 let results = agg.collect()?;
60
61 info!("writing to {:?}", &self.outfile);
62 save_df_parquet(results, &self.outfile)?;
63
64 Ok(())
65 }
66}