import pandas as pd
import matplotlib.pyplot as plt
from duckdb import connect
ISBN Cluster Changes
This notebook audits for significant changes in the clustering results in the book data, to allow us to detect the significance of shifts from version to version. It depends on the aligned cluster identities in isbn-version-clusters.parquet.
Data versions are indexed by month; versions corresponding to tagged versions also have the version in their name.
We are particularly interested in the shift in the number of clusters, and shifts in which cluster an ISBN is associated with (while cluster IDs are not stable across versions, this notebook works on an aligned version of the cluster-ISBN associations).
Load Data
Load the aligned ISBNs:
# Open a DuckDB connection and expose the aligned cluster assignments
# (one column per data version) as a view for the rest of the notebook.
db = connect()
db.execute("CREATE VIEW clusters AS FROM 'isbn-version-clusters.parquet'")
# Peek at a handful of rows to sanity-check the load.
db.table('clusters').show(max_rows=5)
┌───────────────┬──────────┬───────────┬─────────────┬───────────┬─────────────┬───────────┬───────────┬─────────────┬───────────┐
│ isbn │ isbn_id │ current │ 2025-06-3.0 │ 2023-07 │ 2022-11-2.1 │ 2022-10 │ 2022-07 │ 2022-03-2.0 │ pgsql │
│ varchar │ int32 │ int32 │ int32 │ int32 │ int32 │ int32 │ int32 │ int32 │ int32 │
├───────────────┼──────────┼───────────┼─────────────┼───────────┼─────────────┼───────────┼───────────┼─────────────┼───────────┤
│ 9788439382553 │ 40125156 │ 135041912 │ 135041912 │ 135041912 │ NULL │ NULL │ NULL │ NULL │ NULL │
│ 0861082656 │ 21664893 │ 119305977 │ 119305977 │ 119305977 │ 119305977 │ 119305977 │ 119305977 │ 119305977 │ 119305977 │
│ 9783319295428 │ 30718436 │ 120841856 │ 120841856 │ 120841856 │ 120841856 │ 120841856 │ 120841856 │ NULL │ NULL │
│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │
│ · │ · │ · │ · │ · │ · │ · │ · │ · │ · │
│ 8484721337 │ 3447941 │ 134981118 │ 134981118 │ 134981118 │ NULL │ NULL │ NULL │ NULL │ NULL │
│ 9798437649299 │ 22897881 │ 100066513 │ 100066513 │ 100066513 │ NULL │ NULL │ NULL │ NULL │ NULL │
├───────────────┴──────────┴───────────┴─────────────┴───────────┴─────────────┴───────────┴───────────┴─────────────┴───────────┤
│ ? rows (>9999 rows, 5 shown) 10 columns │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
# Every column of the view other than isbn/isbn_id names a data version.
# Lexicographic order sorts the dated versions chronologically; 'pgsql'
# (the oldest export) sorts last, so move it to the front by hand.
db.execute("""
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'clusters'
AND NOT column_name LIKE 'isbn%'
ORDER BY column_name
""")
versions = [r[0] for r in db.fetchall()]
versions.remove('pgsql')
versions.insert(0, 'pgsql')
versions
['pgsql',
'2022-03-2.0',
'2022-07',
'2022-10',
'2022-11-2.1',
'2023-07',
'2025-06-3.0',
'current']
# Reshape the wide version columns into a tall (isbn, version, cluster)
# view so versions can be grouped and compared with plain SQL.
db.execute("""
CREATE VIEW cluster_tall AS
UNPIVOT clusters
ON COLUMNS(* EXCLUDE (isbn, isbn_id))
INTO NAME version VALUE cluster
""")
<duckdb.duckdb.DuckDBPyConnection at 0x7623584c4370>
For further analysis, we want to compute the size of each cluster.
# Materialize per-version cluster sizes; NULL clusters are ISBNs that do
# not exist in that version, so they are excluded from the counts.
db.execute("""
CREATE TABLE cluster_sizes AS
SELECT version, cluster, COUNT(isbn) AS size
FROM cluster_tall
WHERE cluster IS NOT NULL
GROUP BY version, cluster
""")
<duckdb.duckdb.DuckDBPyConnection at 0x7623584c4370>
Cluster Counts
Let’s look at the # of ISBNs and clusters in each dataset.
# Summary metrics per version: total ISBNs, number of distinct clusters,
# and the size of the largest cluster.
metrics = db.query("""
SELECT version,
SUM(size) AS n_isbns,
COUNT(cluster) AS n_clusters,
MAX(size) AS max_size
FROM cluster_sizes
GROUP BY version
""")
metrics
┌─────────────┬──────────┬────────────┬──────────┐
│ version │ n_isbns │ n_clusters │ max_size │
│ varchar │ int128 │ int64 │ int64 │
├─────────────┼──────────┼────────────┼──────────┤
│ 2022-10 │ 32715078 │ 16882949 │ 14378 │
│ 2022-03-2.0 │ 24503563 │ 13201869 │ 7976 │
│ 2023-07 │ 42979427 │ 23191293 │ 47857 │
│ 2022-11-2.1 │ 33505211 │ 17673075 │ 14378 │
│ pgsql │ 24482342 │ 13213677 │ 7818 │
│ current │ 44505174 │ 24107169 │ 51481 │
│ 2025-06-3.0 │ 44505174 │ 24107169 │ 51481 │
│ 2022-07 │ 32715079 │ 17078096 │ 13988 │
└─────────────┴──────────┴────────────┴──────────┘
Cluster Size Distributions
Now we’re going to look at how the sizes of clusters, and the distribution of cluster sizes and changes.
Compute the histogram:
= db.query(
size_hist """
SELECT version, size, COUNT(cluster) AS count
FROM cluster_sizes
GROUP BY version, size
ORDER BY version, size
"""
).to_df()= size_hist.set_index(['version', 'size'])['count'] size_hist
And plot the cumulative distributions:
# Plot, for each version, the cumulative fraction of clusters at or
# below each size.  A symlog x-axis keeps the long tail readable.
for v in versions:
    vss = size_hist.loc[v].sort_index()
    vsc = vss.cumsum() / vss.sum()
    plt.plot(vsc.index, vsc.values, label=v)

plt.title('Distribution of Cluster Sizes')
plt.ylabel('Cum. Frac. of Clusters')
plt.xlabel('Cluster Size')
plt.xscale('symlog')
plt.legend()
plt.show()
Different Clusters
ISBN Changes
How many ISBNs changed cluster across each version?
= db.query(
isbn_clusters """
SELECT * EXCLUDE isbn
FROM clusters
"""
).df()= ['same', 'added', 'changed', 'dropped']
statuses = isbn_clusters[['isbn_id']].copy(deep=False)
changed for (v1, v2) in zip(versions, versions[1:]):
= isbn_clusters[v1]
v1c = isbn_clusters[v2]
v2c = pd.Series('same', index=changed.index)
cc = cc.astype('category').cat.set_categories(statuses, ordered=True)
cc & v2c.notnull()] = 'added'
cc[v1c.isnull() & v2c.isnull()] = 'dropped'
cc[v1c.notnull() & v2c.notnull() & (v1c != v2c)] = 'changed'
cc[v1c.notnull() = cc
changed[v2] del cc
'isbn_id', inplace=True)
changed.set_index( changed.head()
2022-03-2.0 | 2022-07 | 2022-10 | 2022-11-2.1 | 2023-07 | 2025-06-3.0 | current | |
---|---|---|---|---|---|---|---|
isbn_id | |||||||
40125156 | same | same | same | same | added | same | same |
21664893 | same | same | same | same | same | same | same |
30718436 | same | added | same | same | same | same | same |
21921964 | same | same | same | same | same | same | same |
17931298 | same | same | same | same | same | same | same |
Count number in each trajectory:
# Count how many ISBNs follow each full status trajectory across versions,
# with the fraction of all ISBNs and the cumulative fraction (rows come
# back from value_counts in descending count order).
trajectories = changed.value_counts()
trajectories = trajectories.to_frame('count')
trajectories['fraction'] = trajectories['count'] / len(changed)
trajectories['cum_frac'] = trajectories['fraction'].cumsum()
trajectories
count | fraction | cum_frac | |||||||
---|---|---|---|---|---|---|---|---|---|
2022-03-2.0 | 2022-07 | 2022-10 | 2022-11-2.1 | 2023-07 | 2025-06-3.0 | current | |||
same | same | same | same | same | same | same | 23791333 | 5.302665e-01 | 0.530267 |
added | same | same | 9415058 | 2.098449e-01 | 0.740111 | ||||
added | same | same | same | same | same | 7732266 | 1.723385e-01 | 0.912450 | |
same | same | same | same | added | same | 1839384 | 4.099660e-02 | 0.953447 | |
added | same | same | same | 784380 | 1.748244e-02 | 0.970929 | |||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
added | dropped | same | same | added | changed | same | 1 | 2.228822e-08 | 1.000000 |
changed | same | same | dropped | same | same | same | 1 | 2.228822e-08 | 1.000000 |
added | dropped | same | added | same | same | same | 1 | 2.228822e-08 | 1.000000 |
changed | changed | same | dropped | added | same | same | 1 | 2.228822e-08 | 1.000000 |
dropped | same | same | added | changed | same | same | 1 | 2.228822e-08 | 1.000000 |
101 rows × 3 columns
= metrics.df().set_index('version')
metric_df = metric_df.reindex(versions)
metric_df 'new_isbns'] = (changed[versions[1:]] == 'added').sum().reindex(metric_df.index)
metric_df['dropped_isbns'] = (changed[versions[1:]] == 'dropped').sum().reindex(metric_df.index)
metric_df['changed_isbns'] = (changed[versions[1:]] == 'changed').sum().reindex(metric_df.index)
metric_df[ metric_df
n_isbns | n_clusters | max_size | new_isbns | dropped_isbns | changed_isbns | |
---|---|---|---|---|---|---|
version | ||||||
pgsql | 24482342.0 | 13213677 | 7818 | NaN | NaN | NaN |
2022-03-2.0 | 24503563.0 | 13201869 | 7976 | 88086.0 | 66865.0 | 45946.0 |
2022-07 | 32715079.0 | 17078096 | 13988 | 8220673.0 | 9157.0 | 228979.0 |
2022-10 | 32715078.0 | 16882949 | 14378 | 0.0 | 1.0 | 195258.0 |
2022-11-2.1 | 33505211.0 | 17673075 | 14378 | 790244.0 | 111.0 | 7.0 |
2023-07 | 42979427.0 | 23191293 | 47857 | 9489505.0 | 15289.0 | 218442.0 |
2025-06-3.0 | 44505174.0 | 24107169 | 51481 | 1839535.0 | 313788.0 | 140075.0 |
current | 44505174.0 | 24107169 | 51481 | 0.0 | 0.0 | 0.0 |
The biggest change is that the July 2022 update introduced a large number (8.2M) of new ISBNs, with more new ISBNs in later updates. July 2022 incorporated more current book data, and changed the ISBN parsing logic, so this is not surprising.
Let’s save these book changes to a file for future re-analysis:
'isbn-cluster-changes.parquet', compression='zstd') changed.to_parquet(
Final Saved Metrics
Now we’re going to save this metric file to a CSV.
'audit-metrics.csv') metrics.to_csv(