// bookdata/cli/bx/cluster.rs
use std::path::PathBuf;

use crate::prelude::*;
use polars::prelude::*;
6
7#[derive(Args, Debug)]
8#[command(name = "cluster-actions")]
9pub struct Cluster {
10 #[arg(long = "ratings")]
12 ratings: bool,
13
14 #[arg(long = "add-actions")]
16 add_actions: bool,
17
18 #[arg(short = 'o', long = "output", name = "FILE")]
20 outfile: PathBuf,
21}
22
23impl Command for Cluster {
24 fn exec(&self) -> Result<()> {
25 if !self.ratings && !self.add_actions {
26 error!("one of --ratings or --add-actions must be specified");
27 return Err(anyhow!("no mode specified"));
28 }
29 require_working_dir("bx")?;
30
31 let isbns = LazyFrame::scan_parquet("../book-links/isbn-clusters.parquet", default())?;
32 let isbns = isbns.select(&[col("isbn"), col("cluster")]);
33
34 let ratings = LazyCsvReader::new("cleaned-ratings.csv")
35 .has_header(true)
36 .finish()?;
37 let ratings = if self.ratings {
38 ratings.filter(col("rating").gt(0))
39 } else {
40 ratings
41 };
42 let joined = ratings.join(
43 isbns,
44 &[col("isbn")],
45 &[col("isbn")],
46 JoinType::Inner.into(),
47 );
48 let grouped = joined.group_by(&[
49 col("user").alias("user_id"),
50 col("cluster").alias("item_id"),
51 ]);
52 let agg = if self.ratings {
53 grouped.agg(&[
54 col("rating").median().alias("rating"),
55 col("cluster").count().alias("nratings"),
56 ])
57 } else {
58 grouped.agg(&[col("cluster").count().alias("nactions")])
59 };
60
61 info!("collecting results");
62 let results = agg.collect()?;
63
64 info!("writing to {:?}", &self.outfile);
65 save_df_parquet(results, &self.outfile)?;
66
67 Ok(())
68 }
69}