bookdata/ids/
index.rs

1//! Data structure for mapping string keys to numeric identifiers.
2use hashbrown::hash_map::{HashMap, Keys};
3use std::borrow::Borrow;
4use std::fs::File;
5use std::hash::Hash;
6use std::path::Path;
7
8use anyhow::Result;
9use log::*;
10use polars::prelude::*;
11use serde::de::DeserializeOwned;
12use thiserror::Error;
13
14#[cfg(test)]
15use quickcheck::{Arbitrary, Gen};
16#[cfg(test)]
17use tempfile::tempdir;
18
19/// The type of index identifiers.
20pub type Id = i32;
21
22#[derive(Error, Debug)]
23pub enum IndexError {
24    #[error("key not present in frozen index")]
25    KeyNotPresent,
26}
27
28/// Index identifiers from a data type
29pub struct IdIndex<K> {
30    map: HashMap<K, Id>,
31    frozen: bool,
32}
33
34impl<K> IdIndex<K>
35where
36    K: Eq + Hash,
37{
38    /// Create a new index.
39    pub fn new() -> IdIndex<K> {
40        IdIndex {
41            map: HashMap::new(),
42            frozen: false,
43        }
44    }
45
46    /// Freeze the index so no new items can be added.
47    #[allow(dead_code)]
48    pub fn freeze(self) -> IdIndex<K> {
49        IdIndex {
50            map: self.map,
51            frozen: true,
52        }
53    }
54
55    /// Get the index length
56    pub fn len(&self) -> usize {
57        self.map.len()
58    }
59
60    /// Get the ID for a key, adding it to the index if needed.
61    pub fn intern<Q>(&mut self, key: &Q) -> Result<Id, IndexError>
62    where
63        K: Borrow<Q>,
64        Q: Hash + Eq + ToOwned<Owned = K> + ?Sized,
65    {
66        let n = self.map.len() as Id;
67        if self.frozen {
68            self.lookup(key).ok_or(IndexError::KeyNotPresent)
69        } else {
70            // use Hashbrown's raw-entry API to minimize cloning
71            let eb = self.map.raw_entry_mut();
72            let e = eb.from_key(key);
73            let (_, v) = e.or_insert_with(|| (key.to_owned(), n + 1));
74            Ok(*v)
75        }
76    }
77
78    /// Get the ID for a key, adding it to the index if needed and transferring ownership.
79    pub fn intern_owned(&mut self, key: K) -> Result<Id, IndexError> {
80        let n = self.map.len() as Id;
81        if self.frozen {
82            self.lookup(&key).ok_or(IndexError::KeyNotPresent)
83        } else {
84            Ok(*self.map.entry(key).or_insert(n + 1))
85        }
86    }
87
88    /// Look up the ID for a key if it is present.
89    #[allow(dead_code)]
90    pub fn lookup<Q>(&self, key: &Q) -> Option<Id>
91    where
92        K: Borrow<Q>,
93        Q: Hash + Eq + ?Sized,
94    {
95        self.map.get(key).map(|i| *i)
96    }
97
98    /// Iterate over keys (see [std::collections::HashMap::keys]).
99    #[allow(dead_code)]
100    pub fn keys(&self) -> Keys<'_, K, Id> {
101        self.map.keys()
102    }
103}
104
105impl IdIndex<String> {
106    /// Get the keys in order.
107    pub fn key_vec(&self) -> Vec<&str> {
108        let mut vec = Vec::with_capacity(self.len());
109        vec.resize(self.len(), None);
110        for (k, n) in self.map.iter() {
111            let i = (n - 1) as usize;
112            assert!(vec[i].is_none());
113            vec[i] = Some(k);
114        }
115
116        let vec = vec.iter().map(|ro| ro.unwrap().as_str()).collect();
117        vec
118    }
119
120    /// Conver this ID index into a [DataFrame], with columns for ID and key.
121    pub fn data_frame(&self, id_col: &str, key_col: &str) -> Result<DataFrame, PolarsError> {
122        debug!("preparing data frame for index");
123        let n = self.map.len() as i32;
124        let keys = self.key_vec();
125        let ids = Int32Chunked::new(id_col, 1..(n + 1));
126        let keys = StringChunked::new(key_col, keys);
127
128        DataFrame::new(vec![ids.into_series(), keys.into_series()])
129    }
130
131    /// Load from a Parquet file, with a standard configuration.
132    ///
133    /// This assumes the Parquet file has the following columns:
134    ///
135    /// - `key`, of type `String`, storing the keys
136    /// - `id`, of type `i32`, storing the IDs
137    pub fn load_standard<P: AsRef<Path>>(path: P) -> Result<IdIndex<String>> {
138        IdIndex::load(path, "id", "key")
139    }
140
141    /// Load from a Parquet file.
142    ///
143    /// This loads two columns from a Parquet file.  The ID column is expected to
144    /// have type `UInt32` (or a type projectable to it), and the key column should
145    /// be `Utf8`.
146    pub fn load<P: AsRef<Path>>(path: P, id_col: &str, key_col: &str) -> Result<IdIndex<String>> {
147        let path_str = path.as_ref().to_string_lossy();
148        info!("reading index from file {}", path_str);
149        let file = File::open(path.as_ref())?;
150        let frame = ParquetReader::new(file).finish()?;
151        debug!("file schema: {:?}", frame.schema());
152
153        let ic = frame.column(id_col)?.i32()?;
154        let kc = frame.column(key_col)?.str()?;
155
156        let mut map = HashMap::new();
157
158        debug!("reading file contents");
159        let iter = ic.into_iter().zip(kc.into_iter());
160        for pair in iter {
161            if let (Some(id), Some(key)) = pair {
162                map.insert(key.to_string(), id);
163            }
164        }
165
166        info!("read {} keys from {}", map.len(), path_str);
167
168        Ok(IdIndex { map, frozen: false })
169    }
170
171    /// Load an index from a CSV file.
172    ///
173    /// This loads an index from a CSV file.  It assumes the first column is the ID, and the
174    /// second column is the key.
175    #[allow(dead_code)]
176    pub fn load_csv<P: AsRef<Path>, K: Eq + Hash + DeserializeOwned>(
177        path: P,
178    ) -> Result<IdIndex<K>> {
179        info!("reading ID index from from {:?}", path.as_ref());
180        let input = csv::Reader::from_path(path)?;
181        let recs = input.into_deserialize();
182
183        let mut map = HashMap::new();
184
185        for row in recs {
186            let rec: (i32, K) = row?;
187            let (id, key) = rec;
188            map.insert(key, id);
189        }
190
191        Ok(IdIndex { map, frozen: false })
192    }
193
194    /// Save to a Parquet file with the standard configuration.
195    pub fn save_standard<P: AsRef<Path>>(&self, path: P) -> Result<()> {
196        self.save(path, "id", "key")
197    }
198
199    /// Save to a Parquet file with the standard configuration.
200    pub fn save<P: AsRef<Path>>(&self, path: P, id_col: &str, key_col: &str) -> Result<()> {
201        let mut frame = self.data_frame(id_col, key_col)?;
202
203        let path = path.as_ref();
204        info!("saving index to {:?}", path);
205        let file = File::create(path)?;
206        let writer = ParquetWriter::new(file).with_compression(ParquetCompression::Zstd(None));
207        writer.finish(&mut frame)?;
208
209        Ok(())
210    }
211}
212
213#[test]
214fn test_index_empty() {
215    let index: IdIndex<String> = IdIndex::new();
216    assert_eq!(index.len(), 0);
217    assert!(index.lookup("bob").is_none());
218}
219
220#[test]
221fn test_index_intern_one() {
222    let mut index: IdIndex<String> = IdIndex::new();
223    assert!(index.lookup("hackem muche").is_none());
224    let id = index.intern("hackem muche").expect("intern failure");
225    assert_eq!(id, 1);
226    assert_eq!(index.lookup("hackem muche").unwrap(), 1);
227}
228
229#[test]
230fn test_index_intern_two() {
231    let mut index: IdIndex<String> = IdIndex::new();
232    assert!(index.lookup("hackem muche").is_none());
233    let id = index.intern("hackem muche");
234    assert_eq!(id.expect("intern failure"), 1);
235    let id2 = index.intern("readme");
236    assert_eq!(id2.expect("intern failure"), 2);
237    assert_eq!(index.lookup("hackem muche").unwrap(), 1);
238}
239
240#[test]
241fn test_index_intern_twice() {
242    let mut index: IdIndex<String> = IdIndex::new();
243    assert!(index.lookup("hackem muche").is_none());
244    let id = index.intern("hackem muche");
245    assert_eq!(id.expect("intern failure"), 1);
246    let id2 = index.intern("hackem muche");
247    assert_eq!(id2.expect("intern failure"), 1);
248    assert_eq!(index.len(), 1);
249}
250
251#[test]
252fn test_index_intern_twice_owned() {
253    let mut index: IdIndex<String> = IdIndex::new();
254    assert!(index.lookup("hackem muche").is_none());
255    let id = index.intern_owned("hackem muche".to_owned());
256    assert!(id.is_ok());
257    assert_eq!(id.expect("intern failure"), 1);
258    let id2 = index.intern_owned("hackem muche".to_owned());
259    assert!(id2.is_ok());
260    assert_eq!(id2.expect("intern failure"), 1);
261    assert_eq!(index.len(), 1);
262}
263
264#[cfg(test)]
265#[test_log::test]
266fn test_index_save() -> Result<()> {
267    let mut index: IdIndex<String> = IdIndex::new();
268    let mut gen = Gen::new(100);
269    for _i in 0..10000 {
270        let key = String::arbitrary(&mut gen);
271        let prev = index.lookup(&key);
272        let id = index.intern(&key).expect("intern failure");
273        match prev {
274            Some(i) => assert_eq!(id, i),
275            None => assert_eq!(id as usize, index.len()),
276        };
277    }
278
279    let dir = tempdir()?;
280    let pq = dir.path().join("index.parquet");
281    index.save_standard(&pq).expect("save error");
282
283    let i2 = IdIndex::load_standard(&pq).expect("load error");
284    assert_eq!(i2.len(), index.len());
285    for (k, v) in &index.map {
286        let v2 = i2.lookup(k);
287        assert!(v2.is_some());
288        assert_eq!(v2.unwrap(), *v);
289    }
290
291    Ok(())
292}
293
294#[test]
295fn test_index_freeze() {
296    let mut index: IdIndex<String> = IdIndex::new();
297    assert!(index.lookup("hackem muche").is_none());
298    let id = index.intern("hackem muche");
299    assert!(id.is_ok());
300    assert_eq!(id.expect("intern failure"), 1);
301
302    let mut index = index.freeze();
303
304    let id = index.intern("hackem muche");
305    assert!(id.is_ok());
306    assert_eq!(id.expect("intern failure"), 1);
307
308    let id2 = index.intern("foobie bletch");
309    assert!(id2.is_err());
310}