1use hashbrown::hash_map::{HashMap, Keys};
3use std::borrow::Borrow;
4use std::fs::File;
5use std::hash::Hash;
6use std::path::Path;
7
8use anyhow::Result;
9use log::*;
10use polars::prelude::*;
11use serde::de::DeserializeOwned;
12use thiserror::Error;
13
14#[cfg(test)]
15use quickcheck::{Arbitrary, Gen};
16#[cfg(test)]
17use tempfile::tempdir;
18
19pub type Id = i32;
21
22#[derive(Error, Debug)]
23pub enum IndexError {
24 #[error("key not present in frozen index")]
25 KeyNotPresent,
26}
27
28pub struct IdIndex<K> {
30 map: HashMap<K, Id>,
31 frozen: bool,
32}
33
34impl<K> IdIndex<K>
35where
36 K: Eq + Hash,
37{
38 pub fn new() -> IdIndex<K> {
40 IdIndex {
41 map: HashMap::new(),
42 frozen: false,
43 }
44 }
45
46 #[allow(dead_code)]
48 pub fn freeze(self) -> IdIndex<K> {
49 IdIndex {
50 map: self.map,
51 frozen: true,
52 }
53 }
54
55 pub fn len(&self) -> usize {
57 self.map.len()
58 }
59
60 pub fn intern<Q>(&mut self, key: &Q) -> Result<Id, IndexError>
62 where
63 K: Borrow<Q>,
64 Q: Hash + Eq + ToOwned<Owned = K> + ?Sized,
65 {
66 let n = self.map.len() as Id;
67 if self.frozen {
68 self.lookup(key).ok_or(IndexError::KeyNotPresent)
69 } else {
70 let eb = self.map.raw_entry_mut();
72 let e = eb.from_key(key);
73 let (_, v) = e.or_insert_with(|| (key.to_owned(), n + 1));
74 Ok(*v)
75 }
76 }
77
78 pub fn intern_owned(&mut self, key: K) -> Result<Id, IndexError> {
80 let n = self.map.len() as Id;
81 if self.frozen {
82 self.lookup(&key).ok_or(IndexError::KeyNotPresent)
83 } else {
84 Ok(*self.map.entry(key).or_insert(n + 1))
85 }
86 }
87
88 #[allow(dead_code)]
90 pub fn lookup<Q>(&self, key: &Q) -> Option<Id>
91 where
92 K: Borrow<Q>,
93 Q: Hash + Eq + ?Sized,
94 {
95 self.map.get(key).map(|i| *i)
96 }
97
98 #[allow(dead_code)]
100 pub fn keys(&self) -> Keys<'_, K, Id> {
101 self.map.keys()
102 }
103}
104
105impl IdIndex<String> {
106 pub fn key_vec(&self) -> Vec<&str> {
108 let mut vec = Vec::with_capacity(self.len());
109 vec.resize(self.len(), None);
110 for (k, n) in self.map.iter() {
111 let i = (n - 1) as usize;
112 assert!(vec[i].is_none());
113 vec[i] = Some(k);
114 }
115
116 let vec = vec.iter().map(|ro| ro.unwrap().as_str()).collect();
117 vec
118 }
119
120 pub fn data_frame(&self, id_col: &str, key_col: &str) -> Result<DataFrame, PolarsError> {
122 debug!("preparing data frame for index");
123 let n = self.map.len() as i32;
124 let keys = self.key_vec();
125 let ids = Int32Chunked::new(id_col, 1..(n + 1));
126 let keys = StringChunked::new(key_col, keys);
127
128 DataFrame::new(vec![ids.into_series(), keys.into_series()])
129 }
130
131 pub fn load_standard<P: AsRef<Path>>(path: P) -> Result<IdIndex<String>> {
138 IdIndex::load(path, "id", "key")
139 }
140
141 pub fn load<P: AsRef<Path>>(path: P, id_col: &str, key_col: &str) -> Result<IdIndex<String>> {
147 let path_str = path.as_ref().to_string_lossy();
148 info!("reading index from file {}", path_str);
149 let file = File::open(path.as_ref())?;
150 let frame = ParquetReader::new(file).finish()?;
151 debug!("file schema: {:?}", frame.schema());
152
153 let ic = frame.column(id_col)?.i32()?;
154 let kc = frame.column(key_col)?.str()?;
155
156 let mut map = HashMap::new();
157
158 debug!("reading file contents");
159 let iter = ic.into_iter().zip(kc.into_iter());
160 for pair in iter {
161 if let (Some(id), Some(key)) = pair {
162 map.insert(key.to_string(), id);
163 }
164 }
165
166 info!("read {} keys from {}", map.len(), path_str);
167
168 Ok(IdIndex { map, frozen: false })
169 }
170
171 #[allow(dead_code)]
176 pub fn load_csv<P: AsRef<Path>, K: Eq + Hash + DeserializeOwned>(
177 path: P,
178 ) -> Result<IdIndex<K>> {
179 info!("reading ID index from from {:?}", path.as_ref());
180 let input = csv::Reader::from_path(path)?;
181 let recs = input.into_deserialize();
182
183 let mut map = HashMap::new();
184
185 for row in recs {
186 let rec: (i32, K) = row?;
187 let (id, key) = rec;
188 map.insert(key, id);
189 }
190
191 Ok(IdIndex { map, frozen: false })
192 }
193
194 pub fn save_standard<P: AsRef<Path>>(&self, path: P) -> Result<()> {
196 self.save(path, "id", "key")
197 }
198
199 pub fn save<P: AsRef<Path>>(&self, path: P, id_col: &str, key_col: &str) -> Result<()> {
201 let mut frame = self.data_frame(id_col, key_col)?;
202
203 let path = path.as_ref();
204 info!("saving index to {:?}", path);
205 let file = File::create(path)?;
206 let writer = ParquetWriter::new(file).with_compression(ParquetCompression::Zstd(None));
207 writer.finish(&mut frame)?;
208
209 Ok(())
210 }
211}
212
213#[test]
214fn test_index_empty() {
215 let index: IdIndex<String> = IdIndex::new();
216 assert_eq!(index.len(), 0);
217 assert!(index.lookup("bob").is_none());
218}
219
220#[test]
221fn test_index_intern_one() {
222 let mut index: IdIndex<String> = IdIndex::new();
223 assert!(index.lookup("hackem muche").is_none());
224 let id = index.intern("hackem muche").expect("intern failure");
225 assert_eq!(id, 1);
226 assert_eq!(index.lookup("hackem muche").unwrap(), 1);
227}
228
229#[test]
230fn test_index_intern_two() {
231 let mut index: IdIndex<String> = IdIndex::new();
232 assert!(index.lookup("hackem muche").is_none());
233 let id = index.intern("hackem muche");
234 assert_eq!(id.expect("intern failure"), 1);
235 let id2 = index.intern("readme");
236 assert_eq!(id2.expect("intern failure"), 2);
237 assert_eq!(index.lookup("hackem muche").unwrap(), 1);
238}
239
240#[test]
241fn test_index_intern_twice() {
242 let mut index: IdIndex<String> = IdIndex::new();
243 assert!(index.lookup("hackem muche").is_none());
244 let id = index.intern("hackem muche");
245 assert_eq!(id.expect("intern failure"), 1);
246 let id2 = index.intern("hackem muche");
247 assert_eq!(id2.expect("intern failure"), 1);
248 assert_eq!(index.len(), 1);
249}
250
251#[test]
252fn test_index_intern_twice_owned() {
253 let mut index: IdIndex<String> = IdIndex::new();
254 assert!(index.lookup("hackem muche").is_none());
255 let id = index.intern_owned("hackem muche".to_owned());
256 assert!(id.is_ok());
257 assert_eq!(id.expect("intern failure"), 1);
258 let id2 = index.intern_owned("hackem muche".to_owned());
259 assert!(id2.is_ok());
260 assert_eq!(id2.expect("intern failure"), 1);
261 assert_eq!(index.len(), 1);
262}
263
264#[cfg(test)]
265#[test_log::test]
266fn test_index_save() -> Result<()> {
267 let mut index: IdIndex<String> = IdIndex::new();
268 let mut gen = Gen::new(100);
269 for _i in 0..10000 {
270 let key = String::arbitrary(&mut gen);
271 let prev = index.lookup(&key);
272 let id = index.intern(&key).expect("intern failure");
273 match prev {
274 Some(i) => assert_eq!(id, i),
275 None => assert_eq!(id as usize, index.len()),
276 };
277 }
278
279 let dir = tempdir()?;
280 let pq = dir.path().join("index.parquet");
281 index.save_standard(&pq).expect("save error");
282
283 let i2 = IdIndex::load_standard(&pq).expect("load error");
284 assert_eq!(i2.len(), index.len());
285 for (k, v) in &index.map {
286 let v2 = i2.lookup(k);
287 assert!(v2.is_some());
288 assert_eq!(v2.unwrap(), *v);
289 }
290
291 Ok(())
292}
293
294#[test]
295fn test_index_freeze() {
296 let mut index: IdIndex<String> = IdIndex::new();
297 assert!(index.lookup("hackem muche").is_none());
298 let id = index.intern("hackem muche");
299 assert!(id.is_ok());
300 assert_eq!(id.expect("intern failure"), 1);
301
302 let mut index = index.freeze();
303
304 let id = index.intern("hackem muche");
305 assert!(id.is_ok());
306 assert_eq!(id.expect("intern failure"), 1);
307
308 let id2 = index.intern("foobie bletch");
309 assert!(id2.is_err());
310}