bookdata/cli/goodreads/
work_gender.rs1use crate::{ids::codes::NS_GR_WORK, prelude::*};
2use polars::prelude::*;
3
4pub fn link_work_genders() -> Result<()> {
5 require_working_dir("goodreads")?;
6
7 let gender = LazyFrame::scan_parquet("../book-links/cluster-genders.parquet", default())?;
8 let books = LazyFrame::scan_parquet("gr-book-link.parquet", default())?;
9
10 let merged = gender.join(
11 books,
12 &[col("cluster")],
13 &[col("cluster")],
14 JoinType::Inner.into(),
15 );
16 let dedup = merged.unique(None, UniqueKeepStrategy::First);
17 let dedup = dedup.select([
18 col("*"),
19 coalesce(&[
20 col("work_id") + lit(NS_GR_WORK.base()),
21 col("book_id") + lit(NS_GR_WORK.base()),
22 ])
23 .alias("item_id"),
24 ]);
25
26 info!("computing book genders");
27 let results = dedup.clone().collect()?;
28
29 info!("saving {} book-gender records", results.height());
30 save_df_parquet(results, "gr-book-gender.parquet")?;
31
32 info!("computing item genders");
33 let dd2 = dedup
34 .select(&[col("item_id"), col("gender")])
35 .unique(Some(vec!["item_id".into()]), UniqueKeepStrategy::First);
36 let results = dd2.collect()?;
37
38 info!("saving {} item-gender records", results.height());
39 save_df_parquet(results, "gr-work-item-gender.parquet")?;
40
41 Ok(())
42}