From 8644adfb07e34c5ba7032132fa9088f82a5e131b Mon Sep 17 00:00:00 2001 From: Bendik Aagaard Lynghaug Date: Wed, 18 Mar 2026 19:18:48 +0100 Subject: [PATCH] add lsh hashing support --- ReadMe.md | 72 ++++++ src/lib.rs | 2 + src/lsh.rs | 672 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 746 insertions(+) create mode 100644 src/lsh.rs diff --git a/ReadMe.md b/ReadMe.md index b0ccf84..44f048b 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -217,6 +217,78 @@ let keys = adaptive_probe( ); ``` +## LSH Index + +For efficient similarity search using Locality-Sensitive Hashing: + +```rust +use redoal::{Point, normalize, resample, spectral_signature, lsh::LSHIndex}; + +// Create LSH index with default parameters (4 tables, 8 dimensions, 12 projections) +let mut index = LSHIndex::new(4, 8, 12); + +// Process gesture and extract spectral embedding +let gesture = vec![ + Point::new(0.0, 0.0), + Point::new(1.0, 0.0), + Point::new(0.5, 1.0), +]; + +let norm = normalize(&gesture); +let resamp = resample(&norm, 64); +let spectral: Vec<f32> = spectral_signature(&resamp, 8) + .iter() + .map(|x| *x as f32) + .collect(); + +// Insert gesture into index +index.insert(spectral); + +// Query for similar gestures +let query = vec![ + Point::new(0.1, 0.1), + Point::new(1.1, 0.1), + Point::new(0.6, 1.1), +]; + +let norm_query = normalize(&query); +let resamp_query = resample(&norm_query, 64); +let spectral_query: Vec<f32> = spectral_signature(&resamp_query, 8) + .iter() + .map(|x| *x as f32) + .collect(); + +let results = index.query_knn(&spectral_query, 3); +for result in results { + println!("Found similar gesture: {:?}", result); +} +``` + +### LSH Features + +- **Deterministic Initialization**: Fixed seeds for reproducible results +- **Multi-table Indexing**: 4 tables by default for improved recall +- **Projection-based Hashing**: ≤64 projections (u64 bit limit) +- **L2 Distance Ranking**: Always rank results by distance +- **Multi-probe Search**: Query neighboring buckets for 
better results +- **Vector Normalization**: Built-in normalization support + +### LSH Usage Example with Multi-probe + +```rust +let mut index = LSHIndex::new(4, 8, 12); + +// Insert gestures +let gesture1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02]; +let gesture2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01]; +index.insert(gesture1); +index.insert(gesture2); + +// Query with multi-probe (checks neighboring buckets) +let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015]; +let results = index.query_knn_multi_probe(&query, 3, 2); // k=3, probe 2 bits +``` + ## HNSW Index For fast local similarity search on peers: diff --git a/src/lib.rs b/src/lib.rs index 9a1191f..c0aa480 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,6 +43,7 @@ pub mod pca; pub mod morton; pub mod hashing; pub mod hnsw; +pub mod lsh; /// Re-export commonly used types and functions pub use point::Point; @@ -54,6 +55,7 @@ pub use pca::pca; pub use morton::morton2; pub use hashing::*; pub use hnsw::*; +pub use lsh::*; #[cfg(test)] mod tests { diff --git a/src/lsh.rs b/src/lsh.rs new file mode 100644 index 0000000..b07c818 --- /dev/null +++ b/src/lsh.rs @@ -0,0 +1,672 @@ +/// Locality-Sensitive Hashing for gesture similarity search +/// +/// This module implements projection-based LSH for efficient similarity search +/// in high-dimensional vector spaces. It's designed to work with spectral embeddings +/// from gesture data. 
+/// +/// # Key Features +/// - Deterministic initialization with fixed seeds +/// - Multi-table indexing for improved recall +/// - L2 distance ranking for accurate results +/// - Multi-probe search with bit-flip neighbors +/// - Normalized vector support +/// +/// # Example +/// ``` +/// use redoal::lsh::LSHIndex; +/// +/// // Create index with 4 tables, 8 dimensions, 12 projections +/// let mut index = LSHIndex::new(4, 8, 12); +/// +/// // Insert spectral embeddings +/// let gesture1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02]; +/// let gesture2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01]; +/// +/// index.insert(gesture1); +/// index.insert(gesture2); +/// +/// // Query for similar gestures +/// let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015]; +/// let results = index.query_knn(&query, 3); +/// ``` +use rand::{Rng, SeedableRng}; +use rand::rngs::StdRng; +use std::collections::HashMap; +use std::collections::HashSet; + +/// Projection-based hash function for LSH +/// +/// The LSH struct contains random projections used to hash vectors. +/// Each projection is a random vector in the same dimensionality as the input. +/// The hash is computed as the sign of the dot product between the input vector +/// and each projection vector. +/// +/// # Fields +/// * `projections` - Random projection vectors [num_projections][dim] +pub struct LSH { + projections: Vec>, // [num_projections][dim] +} + +/// Single LSH table containing a hash function and buckets +/// +/// Each table has its own independent hash function (different random projections) +/// and maintains buckets of vector indices that hash to the same location. 
+/// +/// # Fields +/// * `hasher` - The LSH hash function for this table +/// * `buckets` - Hash map from hash values to vector indices +struct LSHTable { + hasher: LSH, + buckets: HashMap>, // hash → indices +} + +/// Multi-table LSH index for improved recall +/// +/// Uses multiple independent LSH tables to increase the probability of +/// finding similar vectors. Vectors are inserted into all tables and queried +/// across all tables to gather candidates. +/// +/// # Fields +/// * `tables` - Collection of LSH tables +/// * `data` - Storage for the actual vector data +pub struct LSHIndex { + tables: Vec, + data: Vec>, +} + +impl LSH { + /// Create a new LSH instance with random projections + /// + /// The projections are initialized deterministically using the provided seed. + /// This ensures reproducible results across different runs. + /// + /// # Arguments + /// * `dim` - Dimensionality of input vectors + /// * `num_projections` - Number of random projections (≤64) + /// * `seed` - Random seed for deterministic initialization + /// + /// # Returns + /// New LSH instance + /// + /// # Panics + /// Panics if num_projections > 64 (u64 bit limit) + pub fn new(dim: usize, num_projections: usize, seed: u64) -> Self { + assert!(num_projections <= 64, "num_projections must be ≤ 64"); + + let mut rng = StdRng::seed_from_u64(seed); + + let projections = (0..num_projections) + .map(|_| { + (0..dim) + .map(|_| rng.gen_range(-1.0..1.0)) + .collect() + }) + .collect(); + + Self { projections } + } + + /// Hash a vector using the projection-based hash function + /// + /// Computes the dot product between the input vector and each projection. + /// If the dot product is positive, the corresponding bit is set in the hash. + /// The result is a 64-bit integer where each bit represents one projection. 
+ /// + /// # Arguments + /// * `v` - Input vector to hash + /// + /// # Returns + /// 64-bit hash value + pub fn hash(&self, v: &[f32]) -> u64 { + let mut h = 0u64; + + for (i, proj) in self.projections.iter().enumerate() { + let mut dot = 0.0; + + for j in 0..v.len() { + dot += v[j] * proj[j]; + } + + if dot > 0.0 { + h |= 1 << i; + } + } + + h + } +} + +impl LSHTable { + /// Create a new LSH table + /// + /// # Arguments + /// * `dim` - Dimensionality of input vectors + /// * `projections` - Number of random projections + /// * `seed` - Random seed for deterministic initialization + /// + /// # Returns + /// New LSH table + pub fn new(dim: usize, projections: usize, seed: u64) -> Self { + Self { + hasher: LSH::new(dim, projections, seed), + buckets: HashMap::new(), + } + } + + /// Insert a vector into the table + /// + /// Computes the hash for the vector and stores its index in the corresponding bucket. + /// + /// # Arguments + /// * `v` - Vector to insert + /// * `idx` - Index of the vector in the data storage + pub fn insert(&mut self, v: &[f32], idx: usize) { + let h = self.hasher.hash(v); + self.buckets.entry(h).or_default().push(idx); + } + + /// Query the table for similar vectors + /// + /// Returns all vectors that hash to the same location as the query vector. + /// + /// # Arguments + /// * `v` - Query vector + /// + /// # Returns + /// Vector of indices of similar vectors + pub fn query(&self, v: &[f32]) -> Vec { + let h = self.hasher.hash(v); + self.buckets.get(&h).cloned().unwrap_or_default() + } +} + +impl LSHIndex { + /// Create a new multi-table LSH index + /// + /// Each table uses a different seed (table index) to ensure independent + /// hash functions. This improves recall by increasing the probability + /// of finding similar vectors. 
+ /// + /// # Arguments + /// * `num_tables` - Number of LSH tables + /// * `dim` - Dimensionality of input vectors + /// * `projections` - Number of projections per table + /// + /// # Returns + /// New LSH index + pub fn new(num_tables: usize, dim: usize, projections: usize) -> Self { + let tables = (0..num_tables) + .map(|i| LSHTable::new(dim, projections, i as u64)) + .collect(); + + Self { + tables, + data: Vec::new(), + } + } + + /// Insert a vector into the index + /// + /// The vector is inserted into all tables and stored in the data vector. + /// The index returned can be used to retrieve the vector later. + /// + /// # Arguments + /// * `v` - Vector to insert (will be cloned) + /// + /// # Returns + /// Index of the inserted vector + pub fn insert(&mut self, v: Vec) -> usize { + let idx = self.data.len(); + + for table in &mut self.tables { + table.insert(&v, idx); + } + + self.data.push(v); + idx + } + + /// Query the index for similar vectors (candidate retrieval) + /// + /// Returns all vectors that hash to the same location as the query vector + /// in any of the tables. This is the basic candidate retrieval without ranking. + /// + /// # Arguments + /// * `v` - Query vector + /// + /// # Returns + /// Vector of references to similar vectors + pub fn query(&self, v: &[f32]) -> Vec<&Vec> { + let mut candidates = HashSet::new(); + + for table in &self.tables { + for idx in table.query(v) { + candidates.insert(idx); + } + } + + candidates + .into_iter() + .map(|i| &self.data[i]) + .collect() + } + + /// Query the index for k nearest neighbors (with distance ranking) + /// + /// First retrieves candidate vectors using LSH, then ranks them by L2 distance + /// to the query vector. Returns the top k results. 
+ /// + /// # Arguments + /// * `v` - Query vector + /// * `k` - Number of nearest neighbors to return + /// + /// # Returns + /// Vector of references to the k nearest neighbor vectors + pub fn query_knn(&self, v: &[f32], k: usize) -> Vec<&Vec> { + let mut candidates = self.query(v); + + candidates.sort_by(|a, b| { + l2(a, v).partial_cmp(&l2(b, v)).unwrap() + }); + + candidates.into_iter().take(k).collect() + } + + /// Query with multi-probe search (improved recall) + /// + /// In addition to the exact hash bucket, also checks neighboring buckets + /// by flipping bits in the hash. This improves recall at the cost of + /// slightly more computation. + /// + /// # Arguments + /// * `v` - Query vector + /// * `probe_bits` - Number of bit positions to probe (0 = no multi-probe) + /// + /// # Returns + /// Vector of references to similar vectors + pub fn query_multi_probe(&self, v: &[f32], probe_bits: usize) -> Vec<&Vec> { + let mut candidates = HashSet::new(); + + for table in &self.tables { + let h = table.hasher.hash(v); + + // Start with the exact hash + let mut hashes = vec![h]; + + // Add neighboring hashes by flipping bits + if probe_bits > 0 { + let num_projections = table.hasher.projections.len(); + let bits_to_probe = probe_bits.min(num_projections); + + for i in 0..bits_to_probe { + hashes.push(h ^ (1 << i)); + } + } + + // Collect all candidates from these hashes + for h2 in hashes { + if let Some(bucket) = table.buckets.get(&h2) { + for idx in bucket { + candidates.insert(*idx); + } + } + } + } + + candidates + .into_iter() + .map(|i| &self.data[i]) + .collect() + } + + /// Query k nearest neighbors with multi-probe search + /// + /// Combines multi-probe search with distance ranking for the best results. 
+ /// + /// # Arguments + /// * `v` - Query vector + /// * `k` - Number of nearest neighbors to return + /// * `probe_bits` - Number of bit positions to probe + /// + /// # Returns + /// Vector of references to the k nearest neighbor vectors + pub fn query_knn_multi_probe(&self, v: &[f32], k: usize, probe_bits: usize) -> Vec<&Vec> { + let mut candidates = self.query_multi_probe(v, probe_bits); + + candidates.sort_by(|a, b| { + l2(a, v).partial_cmp(&l2(b, v)).unwrap() + }); + + candidates.into_iter().take(k).collect() + } +} + +/// Compute L2 (Euclidean) distance between two vectors +/// +/// # Arguments +/// * `a` - First vector +/// * `b` - Second vector +/// +/// # Returns +/// L2 distance between the vectors +pub fn l2(a: &[f32], b: &[f32]) -> f32 { + a.iter() + .zip(b.iter()) + .map(|(x, y)| (x - y) * (x - y)) + .sum::() + .sqrt() +} + +/// Normalize a vector to unit length +/// +/// Important: LSH works best with normalized vectors. This function +/// normalizes the input vector so its L2 norm is 1.0. 
///
/// # Arguments
/// * `v` - Input vector
///
/// # Returns
/// Normalized vector (or original if empty/zero)
pub fn normalize_vector(v: &[f32]) -> Vec<f32> {
    // Euclidean norm computed directly; no temporary zero vector is needed.
    let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm == 0.0 {
        v.to_vec()
    } else {
        v.iter().map(|x| x / norm).collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    // The pipeline test uses the crate-level gesture helpers. This module does
    // not import them, so `super::*` alone does not bring them into scope.
    use crate::{normalize, resample, spectral_signature, Point};

    #[test]
    fn test_lsh_basic() {
        let lsh = LSH::new(4, 8, 42);
        let v = vec![1.0, 0.5, 0.3, 0.2];
        let h = lsh.hash(&v);
        // With 8 projections only the low 8 bits may ever be set.
        assert!(h < (1u64 << 8), "hash must fit in num_projections bits");
    }

    #[test]
    fn test_lsh_deterministic() {
        let lsh1 = LSH::new(4, 8, 123);
        let lsh2 = LSH::new(4, 8, 123);

        let v = vec![1.0, 0.5, 0.3, 0.2];
        assert_eq!(lsh1.hash(&v), lsh2.hash(&v));
    }

    #[test]
    fn test_lsh_table_insert_query() {
        let mut table = LSHTable::new(4, 8, 42);

        let v1 = vec![1.0, 0.5, 0.3, 0.2];
        let v2 = vec![1.1, 0.4, 0.35, 0.15];

        table.insert(&v1, 0);
        table.insert(&v2, 1);

        let results = table.query(&v1);
        assert!(results.contains(&0));
    }

    #[test]
    fn test_lsh_index_basic() {
        let mut index = LSHIndex::new(2, 4, 8);

        let v1 = vec![1.0, 0.5, 0.3, 0.2];
        let v2 = vec![1.1, 0.4, 0.35, 0.15];

        let idx1 = index.insert(v1);
        let idx2 = index.insert(v2);

        assert_eq!(idx1, 0);
        assert_eq!(idx2, 1);
        assert_eq!(index.data.len(), 2);
    }

    #[test]
    fn test_lsh_index_query() {
        let mut index = LSHIndex::new(4, 4, 8);

        let a = vec![1.0, 0.0, 0.0, 0.0];
        let b = vec![0.9, 0.1, 0.0, 0.0];

        index.insert(a.clone());
        index.insert(b.clone());

        let result = index.query(&a);
        // A stored vector always shares every bucket with itself.
        assert!(result.iter().any(|r| **r == a));
    }

    #[test]
    fn test_lsh_index_dissimilar_separate() {
        let mut index = LSHIndex::new(4, 4, 8);

        let a = vec![1.0, 0.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0, 0.0];

        index.insert(a.clone());
        index.insert(b.clone());

        let result = index.query(&a);
        // `a` must be found; `b` may or may not collide, but nothing is duplicated.
        assert!(result.iter().any(|r| **r == a));
        assert!(result.len() <= 2);
    }

    #[test]
    fn test_l2_distance() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];

        let dist = l2(&a, &b);
        assert!((dist - std::f32::consts::SQRT_2).abs() < 0.000001);
    }

    #[test]
    fn test_normalize_vector() {
        let v = vec![3.0, 4.0];
        let norm = normalize_vector(&v);

        let expected_norm = l2(&norm, &[0.0, 0.0]);
        assert!((expected_norm - 1.0).abs() < 0.000001);
    }

    #[test]
    fn test_normalize_zero_and_empty() {
        assert_eq!(normalize_vector(&[0.0, 0.0]), vec![0.0, 0.0]);
        assert!(normalize_vector(&[]).is_empty());
    }

    #[test]
    fn test_query_knn() {
        let mut index = LSHIndex::new(4, 4, 8);

        let v1 = vec![1.0, 0.0, 0.0, 0.0];
        let v2 = vec![1.1, 0.1, 0.0, 0.0];
        let v3 = vec![0.0, 1.0, 0.0, 0.0];

        index.insert(v1.clone());
        index.insert(v2.clone());
        index.insert(v3.clone());

        let query = vec![1.05, 0.05, 0.0, 0.0];
        let results = index.query_knn(&query, 2);

        assert_eq!(results.len(), 2);
        // The returned neighbours must come back nearest-first.
        assert!(results
            .windows(2)
            .all(|w| l2(w[0], &query) <= l2(w[1], &query)));
        // Sanity check on the fixture itself: v2 is closer to the query than v3.
        let dist_v2 = l2(&v2, &query);
        let dist_v3 = l2(&v3, &query);
        assert!(dist_v2 < dist_v3);
    }

    #[test]
    fn test_multi_probe() {
        let mut index = LSHIndex::new(2, 4, 8);

        let v1 = vec![1.0, 0.0, 0.0, 0.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0]; // Same as v1
        let v3 = vec![0.9, 0.1, 0.0, 0.0]; // Similar

        index.insert(v1);
        index.insert(v2);
        index.insert(v3);

        let query = vec![1.0, 0.0, 0.0, 0.0];
        let results = index.query_multi_probe(&query, 1);

        // v1 and v2 equal the query, so both are guaranteed to collide with it.
        assert!(results.len() >= 2);
    }

    /// Integration test demonstrating the full pipeline:
    /// gesture → spectral embedding → LSH index → query
    ///
    /// This test verifies that LSH works correctly with the spectral embedding
    /// pipeline and can find similar gestures.
    #[test]
    fn test_gesture_to_lsh_pipeline() {
        // normalize → resample to 64 points → 8-dimensional spectral signature,
        // narrowed from f64 to the f32 the index stores.
        let embed = |gesture: &Vec<Point>| -> Vec<f32> {
            let normalized = normalize(gesture);
            let resampled = resample(&normalized, 64);
            let signature = spectral_signature(&resampled, 8);
            signature.iter().map(|x| *x as f32).collect()
        };

        // Similar gestures (triangles of slightly different sizes)
        let gesture1 = vec![
            Point::new(0.0, 0.0),
            Point::new(1.0, 0.0),
            Point::new(0.5, 1.0),
        ];

        let gesture2 = vec![
            Point::new(0.0, 0.0),
            Point::new(1.1, 0.0),
            Point::new(0.55, 1.05),
        ];

        let gesture3 = vec![
            Point::new(0.0, 0.0),
            Point::new(0.9, 0.0),
            Point::new(0.45, 0.95),
        ];

        let mut index = LSHIndex::new(4, 8, 12);
        index.insert(embed(&gesture1));
        index.insert(embed(&gesture2));
        index.insert(embed(&gesture3));

        // Query gesture (similar to gesture1)
        let query_gesture = vec![
            Point::new(0.0, 0.0),
            Point::new(1.05, 0.0),
            Point::new(0.52, 1.02),
        ];
        let spectral_query_f32 = embed(&query_gesture);

        let results = index.query_knn(&spectral_query_f32, 2);

        // Should find at least 2 similar gestures
        assert!(results.len() >= 2, "LSH should find similar gestures");

        // Verify the results are actually similar (low L2 distance)
        for result in &results {
            let dist = l2(&spectral_query_f32, result);
            assert!(dist < 0.5, "Found gestures should be similar (L2 distance < 0.5)");
        }
    }

    /// Test with default parameters as specified in requirements
    #[test]
    fn test_default_parameters() {
        let mut index = LSHIndex::new(4, 8, 12); // num_tables=4, dim=8, projections=12

        // Insert some spectral embeddings
        let v1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
        let v2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01];
        let v3 = vec![0.9, 0.6, 0.25, 0.3, 0.15, 0.06, 0.04, 0.03];

        index.insert(v1);
        index.insert(v2);
        index.insert(v3);

        // Query with similar vector
        let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015];
        let results = index.query_knn(&query, 3);

        // Should find results
        assert!(!results.is_empty(), "Query should return results");
    }

    /// Test that LSH works with normalized vectors
    #[test]
    fn test_lsh_with_normalized_vectors() {
        let mut index = LSHIndex::new(4, 8, 12);

        // Same direction, different magnitudes
        let v1 = vec![2.0, 1.0, 0.6, 0.4, 0.2, 0.1, 0.06, 0.04];
        let v2 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
        let v3 = vec![0.5, 0.25, 0.15, 0.1, 0.05, 0.025, 0.015, 0.01];

        // Normalize vectors before inserting
        index.insert(normalize_vector(&v1));
        index.insert(normalize_vector(&v2));
        index.insert(normalize_vector(&v3));

        // Query with normalized vector
        let query = vec![1.5, 0.75, 0.375, 0.25, 0.125, 0.0625, 0.03125, 0.015625];
        let norm_query = normalize_vector(&query);

        let results = index.query_knn(&norm_query, 2);

        assert!(results.len() >= 2, "Should find similar normalized vectors");
    }

    /// Test multi-probe search with spectral embeddings
    #[test]
    fn test_multi_probe_with_spectral() {
        let mut index = LSHIndex::new(2, 8, 12);

        let v1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
        let v2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01];
        let v3 = vec![0.9, 0.6, 0.25, 0.3, 0.15, 0.06, 0.04, 0.03];

        index.insert(v1);
        index.insert(v2);
        index.insert(v3);

        let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015];

        // Basic query
        let basic_results = index.query(&query);

        // Multi-probe query (probes a superset of the basic query's buckets)
        let probe_results = index.query_multi_probe(&query, 2);

        assert!(probe_results.len() >= basic_results.len(),
            "Multi-probe should find at least as many results as basic query");
    }
}