bookdata/cli/goodreads/
work_gender.rs

1use crate::{ids::codes::NS_GR_WORK, prelude::*};
2use polars::prelude::*;
3
4pub fn link_work_genders() -> Result<()> {
5    require_working_dir("goodreads")?;
6
7    let gender = LazyFrame::scan_parquet("../book-links/cluster-genders.parquet", default())?;
8    let books = LazyFrame::scan_parquet("gr-book-link.parquet", default())?;
9
10    let merged = gender.join(
11        books,
12        &[col("cluster")],
13        &[col("cluster")],
14        JoinType::Inner.into(),
15    );
16    let dedup = merged.unique(None, UniqueKeepStrategy::First);
17    let dedup = dedup.select([
18        col("*"),
19        coalesce(&[
20            col("work_id") + lit(NS_GR_WORK.base()),
21            col("book_id") + lit(NS_GR_WORK.base()),
22        ])
23        .alias("item_id"),
24    ]);
25
26    info!("computing book genders");
27    let results = dedup.clone().collect()?;
28
29    info!("saving {} book-gender records", results.height());
30    save_df_parquet(results, "gr-book-gender.parquet")?;
31
32    info!("computing item genders");
33    let dd2 = dedup
34        .select(&[col("item_id"), col("gender")])
35        .unique(Some(vec!["item_id".into()]), UniqueKeepStrategy::First);
36    let results = dd2.collect()?;
37
38    info!("saving {} item-gender records", results.height());
39    save_df_parquet(results, "gr-work-item-gender.parquet")?;
40
41    Ok(())
42}