add lsh hashing support

This commit is contained in:
2026-03-18 19:18:48 +01:00
parent 3c0111f2cb
commit 8644adfb07
3 changed files with 746 additions and 0 deletions

View File

@@ -217,6 +217,78 @@ let keys = adaptive_probe(
);
```
## LSH Index
For efficient similarity search using Locality-Sensitive Hashing:
```rust
use redoal::{Point, normalize, resample, spectral_signature, lsh::LSHIndex};
// Create LSH index with default parameters (4 tables, 8 dimensions, 12 projections)
let mut index = LSHIndex::new(4, 8, 12);
// Process gesture and extract spectral embedding
let gesture = vec![
Point::new(0.0, 0.0),
Point::new(1.0, 0.0),
Point::new(0.5, 1.0),
];
let norm = normalize(&gesture);
let resamp = resample(&norm, 64);
let spectral: Vec<f32> = spectral_signature(&resamp, 8)
.iter()
.map(|x| *x as f32)
.collect();
// Insert gesture into index
index.insert(spectral);
// Query for similar gestures
let query = vec![
Point::new(0.1, 0.1),
Point::new(1.1, 0.1),
Point::new(0.6, 1.1),
];
let norm_query = normalize(&query);
let resamp_query = resample(&norm_query, 64);
let spectral_query: Vec<f32> = spectral_signature(&resamp_query, 8)
.iter()
.map(|x| *x as f32)
.collect();
let results = index.query_knn(&spectral_query, 3);
for result in results {
println!("Found similar gesture: {:?}", result);
}
```
### LSH Features
- **Deterministic Initialization**: Fixed seeds for reproducible results
- **Multi-table Indexing**: Multiple independent hash tables (the examples use 4) for improved recall
- **Projection-based Hashing**: ≤64 projections (u64 bit limit)
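- **L2 Distance Ranking**: `query_knn` and `query_knn_multi_probe` rank candidates by L2 distance (`query` and `query_multi_probe` return unranked candidates)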
- **Multi-probe Search**: Query neighboring buckets for better results
- **Vector Normalization**: `normalize_vector` helper for scaling embeddings to unit length before insertion (not applied automatically)
### LSH Usage Example with Multi-probe
```rust
let mut index = LSHIndex::new(4, 8, 12);
// Insert gestures
let gesture1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
let gesture2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01];
index.insert(gesture1);
index.insert(gesture2);
// Query with multi-probe (checks neighboring buckets)
let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015];
let results = index.query_knn_multi_probe(&query, 3, 2); // k=3, probe 2 bits
```
## HNSW Index
For fast local similarity search on peers:

View File

@@ -43,6 +43,7 @@ pub mod pca;
pub mod morton;
pub mod hashing;
pub mod hnsw;
pub mod lsh;
/// Re-export commonly used types and functions
pub use point::Point;
@@ -54,6 +55,7 @@ pub use pca::pca;
pub use morton::morton2;
pub use hashing::*;
pub use hnsw::*;
pub use lsh::*;
#[cfg(test)]
mod tests {

672
src/lsh.rs Normal file
View File

@@ -0,0 +1,672 @@
//! Locality-Sensitive Hashing for gesture similarity search
//!
//! This module implements projection-based LSH for efficient similarity search
//! in high-dimensional vector spaces. It is designed to work with spectral embeddings
//! from gesture data.
//!
//! # Key Features
//! - Deterministic initialization with fixed seeds
//! - Multi-table indexing for improved recall
//! - L2 distance ranking for accurate results
//! - Multi-probe search with bit-flip neighbors
//! - Normalized vector support
//!
//! # Example
//! ```
//! use redoal::lsh::LSHIndex;
//!
//! // Create index with 4 tables, 8 dimensions, 12 projections
//! let mut index = LSHIndex::new(4, 8, 12);
//!
//! // Insert spectral embeddings
//! let gesture1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
//! let gesture2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01];
//!
//! index.insert(gesture1);
//! index.insert(gesture2);
//!
//! // Query for similar gestures
//! let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015];
//! let results = index.query_knn(&query, 3);
//! ```
use rand::{Rng, SeedableRng};
use rand::rngs::StdRng;
use std::collections::HashMap;
use std::collections::HashSet;
/// Projection-based hash function for LSH.
///
/// Holds one projection vector per output bit. [`LSH::hash`] takes the dot
/// product of the input with every projection and sets the corresponding bit
/// when that dot product is strictly positive.
///
/// # Fields
/// * `projections` - Projection vectors, laid out as `[num_projections][dim]`
#[derive(Debug, Clone)]
pub struct LSH {
    /// One vector per hash bit; each inner vector has `dim` components.
    projections: Vec<Vec<f32>>,
}
/// Single LSH table: one hash function plus the buckets it fills.
///
/// `LSHIndex::new` gives every table a different seed (the table's position),
/// so each table hashes with its own set of projections. A bucket collects the
/// indices of all vectors whose hash equals the bucket's key.
///
/// # Fields
/// * `hasher` - The LSH hash function for this table
/// * `buckets` - Map from a 64-bit hash value to the indices stored under it
struct LSHTable {
/// Hash function used for both insertion and lookup in this table.
hasher: LSH,
/// Indices are the values passed to `insert`, kept in insertion order per bucket.
buckets: HashMap<u64, Vec<usize>>, // hash → indices
}
/// Multi-table LSH index for improved recall
///
/// Every inserted vector is hashed into all tables; a query gathers the
/// candidates found in any table and de-duplicates them. Using several
/// independently seeded tables raises the chance that two similar vectors
/// share a bucket in at least one of them.
///
/// # Fields
/// * `tables` - Collection of LSH tables
/// * `data` - Storage for the inserted vectors; a vector's position here is
///   the index returned by `insert` and stored in the table buckets
pub struct LSHIndex {
/// One entry per table requested in `LSHIndex::new`.
tables: Vec<LSHTable>,
/// Inserted vectors, in insertion order.
data: Vec<Vec<f32>>,
}
impl LSH {
    /// Create a new LSH instance with seeded random projections.
    ///
    /// Each projection component is drawn from the half-open range
    /// `-1.0..1.0` using a `StdRng` seeded with `seed`, so two instances built
    /// with the same arguments produce the same projections within one build.
    ///
    /// NOTE(review): the `rand` documentation states that `StdRng`'s algorithm
    /// may change between `rand` releases, so hashes are presumably not stable
    /// across dependency upgrades — confirm before persisting them.
    ///
    /// # Arguments
    /// * `dim` - Dimensionality of input vectors
    /// * `num_projections` - Number of random projections (≤64)
    /// * `seed` - Random seed for deterministic initialization
    ///
    /// # Returns
    /// New LSH instance
    ///
    /// # Panics
    /// Panics if `num_projections > 64`, because each projection occupies one
    /// bit of the `u64` hash.
    pub fn new(dim: usize, num_projections: usize, seed: u64) -> Self {
        assert!(num_projections <= 64, "num_projections must be ≤ 64");
        let mut rng = StdRng::seed_from_u64(seed);
        let projections = (0..num_projections)
            .map(|_| {
                (0..dim)
                    .map(|_| rng.gen_range(-1.0..1.0))
                    .collect::<Vec<f32>>()
            })
            .collect();
        Self { projections }
    }

    /// Hash a vector using the projection-based hash function.
    ///
    /// Bit `i` of the result is set when the dot product of `v` with
    /// projection `i` is strictly positive; all other bits are zero.
    ///
    /// The dot product pairs components positionally and stops at the shorter
    /// of the two vectors, so an input longer than the projection dimension no
    /// longer triggers an out-of-bounds panic; the surplus components are
    /// ignored.
    ///
    /// # Arguments
    /// * `v` - Input vector to hash
    ///
    /// # Returns
    /// 64-bit hash value
    pub fn hash(&self, v: &[f32]) -> u64 {
        self.projections
            .iter()
            .enumerate()
            .fold(0u64, |bits, (i, proj)| {
                let dot: f32 = v.iter().zip(proj).map(|(x, p)| x * p).sum();
                if dot > 0.0 {
                    // `i < 64` is guaranteed by the assertion in `new`.
                    bits | (1u64 << i)
                } else {
                    bits
                }
            })
    }
}
impl LSHTable {
    /// Build an empty table whose hasher is seeded with `seed`.
    ///
    /// # Arguments
    /// * `dim` - Dimensionality of input vectors
    /// * `projections` - Number of random projections
    /// * `seed` - Random seed forwarded to the hasher
    ///
    /// # Returns
    /// New LSH table with no buckets
    pub fn new(dim: usize, projections: usize, seed: u64) -> Self {
        let hasher = LSH::new(dim, projections, seed);
        let buckets = HashMap::new();
        Self { hasher, buckets }
    }

    /// Record `idx` in the bucket that `v` hashes to.
    ///
    /// The bucket is created on first use.
    ///
    /// # Arguments
    /// * `v` - Vector to insert
    /// * `idx` - Index of the vector in the data storage
    pub fn insert(&mut self, v: &[f32], idx: usize) {
        let key = self.hasher.hash(v);
        self.buckets.entry(key).or_insert_with(Vec::new).push(idx);
    }

    /// Look up the bucket that `v` hashes to.
    ///
    /// # Arguments
    /// * `v` - Query vector
    ///
    /// # Returns
    /// A copy of the indices stored under the query's hash, or an empty
    /// vector when no such bucket exists
    pub fn query(&self, v: &[f32]) -> Vec<usize> {
        let key = self.hasher.hash(v);
        match self.buckets.get(&key) {
            Some(bucket) => bucket.clone(),
            None => Vec::new(),
        }
    }
}
impl LSHIndex {
    /// Create a new multi-table LSH index.
    ///
    /// Table `i` is seeded with `i`, so every table hashes with a different
    /// set of projections.
    ///
    /// # Arguments
    /// * `num_tables` - Number of LSH tables
    /// * `dim` - Dimensionality of input vectors
    /// * `projections` - Number of projections per table
    ///
    /// # Returns
    /// New, empty LSH index
    pub fn new(num_tables: usize, dim: usize, projections: usize) -> Self {
        let tables = (0..num_tables)
            .map(|i| LSHTable::new(dim, projections, i as u64))
            .collect();
        Self {
            tables,
            data: Vec::new(),
        }
    }

    /// Insert a vector into the index.
    ///
    /// The vector is hashed into every table and then moved into the index's
    /// storage (it is not cloned).
    ///
    /// # Arguments
    /// * `v` - Vector to insert
    ///
    /// # Returns
    /// Index of the inserted vector (its position in insertion order)
    pub fn insert(&mut self, v: Vec<f32>) -> usize {
        let idx = self.data.len();
        for table in &mut self.tables {
            table.insert(&v, idx);
        }
        self.data.push(v);
        idx
    }

    /// Query the index for candidate vectors (no ranking).
    ///
    /// Returns every stored vector that shares a bucket with `v` in at least
    /// one table. Each vector appears once, in insertion order.
    ///
    /// # Arguments
    /// * `v` - Query vector
    ///
    /// # Returns
    /// Vector of references to the candidate vectors
    pub fn query(&self, v: &[f32]) -> Vec<&Vec<f32>> {
        // Probing zero extra bits visits exactly the query's own bucket.
        self.query_multi_probe(v, 0)
    }

    /// Query the index for k nearest neighbors (with distance ranking).
    ///
    /// Retrieves candidates with [`LSHIndex::query`] and orders them by
    /// ascending L2 distance to `v`. Candidates whose distance is NaN are
    /// ranked last instead of causing a panic; ties keep insertion order.
    ///
    /// # Arguments
    /// * `v` - Query vector
    /// * `k` - Maximum number of neighbors to return
    ///
    /// # Returns
    /// Up to `k` references to the closest candidate vectors
    pub fn query_knn(&self, v: &[f32], k: usize) -> Vec<&Vec<f32>> {
        Self::rank_by_distance(self.query(v), v, k)
    }

    /// Query with multi-probe search (improved recall).
    ///
    /// For each table this visits the query's own bucket plus the buckets
    /// whose key differs from the query hash in exactly one of the lowest
    /// `probe_bits` bits (capped at the table's number of projections).
    /// Each matching vector appears once, in insertion order.
    ///
    /// # Arguments
    /// * `v` - Query vector
    /// * `probe_bits` - Number of bit positions to probe (0 = no multi-probe)
    ///
    /// # Returns
    /// Vector of references to the candidate vectors
    pub fn query_multi_probe(&self, v: &[f32], probe_bits: usize) -> Vec<&Vec<f32>> {
        let mut seen = HashSet::new();
        for table in &self.tables {
            let h = table.hasher.hash(v);
            let flips = probe_bits.min(table.hasher.projections.len());
            // Exact hash first, then one single-bit neighbor per probed position.
            let probes = std::iter::once(h).chain((0..flips).map(|bit| h ^ (1u64 << bit)));
            for probe in probes {
                if let Some(bucket) = table.buckets.get(&probe) {
                    seen.extend(bucket.iter().copied());
                }
            }
        }

        // Hash-set iteration order is arbitrary; sort the indices so results
        // (and tie-breaking during ranking) are reproducible.
        let mut ids: Vec<usize> = seen.into_iter().collect();
        ids.sort_unstable();
        ids.into_iter().map(|i| &self.data[i]).collect()
    }

    /// Query k nearest neighbors with multi-probe search.
    ///
    /// Retrieves candidates with [`LSHIndex::query_multi_probe`] and orders
    /// them by ascending L2 distance to `v`, with the same NaN and tie
    /// handling as [`LSHIndex::query_knn`].
    ///
    /// # Arguments
    /// * `v` - Query vector
    /// * `k` - Maximum number of neighbors to return
    /// * `probe_bits` - Number of bit positions to probe
    ///
    /// # Returns
    /// Up to `k` references to the closest candidate vectors
    pub fn query_knn_multi_probe(&self, v: &[f32], k: usize, probe_bits: usize) -> Vec<&Vec<f32>> {
        Self::rank_by_distance(self.query_multi_probe(v, probe_bits), v, k)
    }

    /// Sort `candidates` by ascending L2 distance to `v` and keep the first `k`.
    fn rank_by_distance<'a>(
        candidates: Vec<&'a Vec<f32>>,
        v: &[f32],
        k: usize,
    ) -> Vec<&'a Vec<f32>> {
        // Compute each distance once instead of inside the sort comparator.
        let mut scored: Vec<(f32, &'a Vec<f32>)> = candidates
            .into_iter()
            .map(|candidate| {
                let dist = l2(candidate, v);
                // NaN has no place in the ordering; push it to the end.
                (if dist.is_nan() { f32::INFINITY } else { dist }, candidate)
            })
            .collect();
        // Stable sort: equal distances keep their incoming (insertion) order.
        scored.sort_by(|a, b| a.0.total_cmp(&b.0));
        scored
            .into_iter()
            .take(k)
            .map(|(_, candidate)| candidate)
            .collect()
    }
}
/// Euclidean (L2) distance between two vectors.
///
/// Components are paired positionally; if the slices differ in length, only
/// the leading components present in both contribute.
///
/// # Arguments
/// * `a` - First vector
/// * `b` - Second vector
///
/// # Returns
/// Square root of the summed squared component differences
pub fn l2(a: &[f32], b: &[f32]) -> f32 {
    let squared_sum: f32 = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| {
            let delta = lhs - rhs;
            delta * delta
        })
        .sum();
    squared_sum.sqrt()
}
/// Normalize a vector to unit length.
///
/// Divides every component by the vector's L2 norm. The norm is computed
/// directly from the components, without allocating a temporary zero vector.
///
/// # Arguments
/// * `v` - Input vector
///
/// # Returns
/// A new vector with L2 norm 1.0, or an unchanged copy of `v` when its norm
/// is zero (which includes the empty vector)
pub fn normalize_vector(v: &[f32]) -> Vec<f32> {
    let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm == 0.0 {
        // Nothing to scale by; dividing would only produce NaNs.
        v.to_vec()
    } else {
        v.iter().map(|x| x / norm).collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // The pipeline test uses the gesture helpers, which are exported from the
    // crate root rather than from this module, so `super::*` does not provide them.
    use crate::{normalize, resample, spectral_signature, Point};

    #[test]
    fn test_lsh_basic() {
        let lsh = LSH::new(4, 8, 42);
        let v = vec![1.0, 0.5, 0.3, 0.2];
        let h = lsh.hash(&v);
        assert!(h != 0);
        // Only the 8 configured projections may contribute bits.
        assert_eq!(h >> 8, 0);
    }

    #[test]
    fn test_lsh_deterministic() {
        let lsh1 = LSH::new(4, 8, 123);
        let lsh2 = LSH::new(4, 8, 123);
        let v = vec![1.0, 0.5, 0.3, 0.2];
        assert_eq!(lsh1.hash(&v), lsh2.hash(&v));
    }

    #[test]
    fn test_lsh_table_insert_query() {
        let mut table = LSHTable::new(4, 8, 42);
        let v1 = vec![1.0, 0.5, 0.3, 0.2];
        let v2 = vec![1.1, 0.4, 0.35, 0.15];
        table.insert(&v1, 0);
        table.insert(&v2, 1);
        let results = table.query(&v1);
        assert!(results.contains(&0));
    }

    #[test]
    fn test_lsh_index_basic() {
        let mut index = LSHIndex::new(2, 4, 8);
        let v1 = vec![1.0, 0.5, 0.3, 0.2];
        let v2 = vec![1.1, 0.4, 0.35, 0.15];
        let idx1 = index.insert(v1);
        let idx2 = index.insert(v2);
        assert_eq!(idx1, 0);
        assert_eq!(idx2, 1);
        assert_eq!(index.data.len(), 2);
    }

    #[test]
    fn test_lsh_index_query() {
        let mut index = LSHIndex::new(4, 4, 8);
        let a = vec![1.0, 0.0, 0.0, 0.0];
        let b = vec![0.9, 0.1, 0.0, 0.0];
        index.insert(a.clone());
        index.insert(b.clone());
        let result = index.query(&a);
        assert!(!result.is_empty());
    }

    #[test]
    fn test_lsh_index_dissimilar_separate() {
        let mut index = LSHIndex::new(4, 4, 8);
        let a = vec![1.0, 0.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0, 0.0];
        index.insert(a.clone());
        index.insert(b.clone());
        let result = index.query(&a);
        assert!(result.len() <= 2);
        // A stored vector always shares a bucket with itself.
        assert!(result.iter().any(|r| **r == a));
    }

    #[test]
    fn test_l2_distance() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        let dist = l2(&a, &b);
        assert!((dist - 1.4142135).abs() < 0.000001); // sqrt(2)
    }

    #[test]
    fn test_normalize_vector() {
        let v = vec![3.0, 4.0];
        let norm = normalize_vector(&v);
        let expected_norm = l2(&norm, &vec![0.0, 0.0]);
        assert!((expected_norm - 1.0).abs() < 0.000001);
    }

    #[test]
    fn test_query_knn() {
        let mut index = LSHIndex::new(4, 4, 8);
        let v1 = vec![1.0, 0.0, 0.0, 0.0];
        let v2 = vec![1.1, 0.1, 0.0, 0.0];
        let v3 = vec![0.0, 1.0, 0.0, 0.0];
        index.insert(v1.clone());
        index.insert(v2.clone());
        index.insert(v3.clone());
        let query = vec![1.05, 0.05, 0.0, 0.0];
        let results = index.query_knn(&query, 2);
        assert_eq!(results.len(), 2);
        // The returned neighbors themselves must be in ascending distance order.
        assert!(results
            .windows(2)
            .all(|pair| l2(pair[0], &query) <= l2(pair[1], &query)));
        // Sanity check on the fixture: v2 is closer to the query than v3.
        let dist_v2 = l2(&v2, &query);
        let dist_v3 = l2(&v3, &query);
        assert!(dist_v2 < dist_v3);
    }

    #[test]
    fn test_multi_probe() {
        let mut index = LSHIndex::new(2, 4, 8);
        let v1 = vec![1.0, 0.0, 0.0, 0.0];
        let v2 = vec![1.0, 0.0, 0.0, 0.0]; // Same as v1
        let v3 = vec![0.9, 0.1, 0.0, 0.0]; // Similar
        index.insert(v1);
        index.insert(v2);
        index.insert(v3);
        let query = vec![1.0, 0.0, 0.0, 0.0];
        let results = index.query_multi_probe(&query, 1);
        // Should find at least 2 similar vectors
        assert!(results.len() >= 2);
    }

    /// Integration test demonstrating the full pipeline:
    /// gesture → spectral embedding → LSH index → query
    ///
    /// This test verifies that LSH works correctly with the spectral embedding
    /// pipeline and can find similar gestures.
    #[test]
    fn test_gesture_to_lsh_pipeline() {
        // Create similar gestures (triangles of different sizes)
        let gesture1 = vec![
            Point::new(0.0, 0.0),
            Point::new(1.0, 0.0),
            Point::new(0.5, 1.0),
        ];
        let gesture2 = vec![
            Point::new(0.0, 0.0),
            Point::new(1.1, 0.0),
            Point::new(0.55, 1.05),
        ];
        let gesture3 = vec![
            Point::new(0.0, 0.0),
            Point::new(0.9, 0.0),
            Point::new(0.45, 0.95),
        ];
        // Normalize and resample all gestures
        let norm1 = normalize(&gesture1);
        let resamp1 = resample(&norm1, 64);
        let norm2 = normalize(&gesture2);
        let resamp2 = resample(&norm2, 64);
        let norm3 = normalize(&gesture3);
        let resamp3 = resample(&norm3, 64);
        // Compute spectral signatures (8 dimensions as per requirements)
        let spectral1 = spectral_signature(&resamp1, 8);
        let spectral2 = spectral_signature(&resamp2, 8);
        let spectral3 = spectral_signature(&resamp3, 8);
        // Create LSH index with default parameters
        let mut index = LSHIndex::new(4, 8, 12);
        // Insert spectral embeddings into LSH index
        // Convert f64 to f32 for LSH (spectral_signature returns f64)
        let spectral1_f32: Vec<f32> = spectral1.iter().map(|x| *x as f32).collect();
        let spectral2_f32: Vec<f32> = spectral2.iter().map(|x| *x as f32).collect();
        let spectral3_f32: Vec<f32> = spectral3.iter().map(|x| *x as f32).collect();
        index.insert(spectral1_f32);
        index.insert(spectral2_f32);
        index.insert(spectral3_f32);
        // Create a query gesture (similar to gesture1)
        let query_gesture = vec![
            Point::new(0.0, 0.0),
            Point::new(1.05, 0.0),
            Point::new(0.52, 1.02),
        ];
        let norm_query = normalize(&query_gesture);
        let resamp_query = resample(&norm_query, 64);
        let spectral_query = spectral_signature(&resamp_query, 8);
        // Convert f64 to f32 for LSH
        let spectral_query_f32: Vec<f32> = spectral_query.iter().map(|x| *x as f32).collect();
        // Query the LSH index for similar gestures
        let results = index.query_knn(&spectral_query_f32, 2);
        // Should find at least 2 similar gestures
        assert!(results.len() >= 2, "LSH should find similar gestures");
        // Verify the results are actually similar (low L2 distance)
        for result in &results {
            let dist = l2(&spectral_query_f32, result);
            assert!(dist < 0.5, "Found gestures should be similar (L2 distance < 0.5)");
        }
    }

    /// Test with default parameters as specified in requirements
    #[test]
    fn test_default_parameters() {
        let mut index = LSHIndex::new(4, 8, 12); // num_tables=4, dim=8, projections=12
        // Insert some spectral embeddings
        let v1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
        let v2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01];
        let v3 = vec![0.9, 0.6, 0.25, 0.3, 0.15, 0.06, 0.04, 0.03];
        index.insert(v1);
        index.insert(v2);
        index.insert(v3);
        // Query with similar vector
        let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015];
        let results = index.query_knn(&query, 3);
        // Should find results
        assert!(!results.is_empty(), "Query should return results");
    }

    /// Test that LSH works with normalized vectors
    #[test]
    fn test_lsh_with_normalized_vectors() {
        let mut index = LSHIndex::new(4, 8, 12);
        // Create vectors of different magnitudes
        let v1 = vec![2.0, 1.0, 0.6, 0.4, 0.2, 0.1, 0.06, 0.04];
        let v2 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
        let v3 = vec![0.5, 0.25, 0.15, 0.1, 0.05, 0.025, 0.015, 0.01];
        // Normalize vectors before inserting
        let norm_v1 = normalize_vector(&v1);
        let norm_v2 = normalize_vector(&v2);
        let norm_v3 = normalize_vector(&v3);
        index.insert(norm_v1);
        index.insert(norm_v2);
        index.insert(norm_v3);
        // Query with normalized vector
        let query = vec![1.5, 0.75, 0.375, 0.25, 0.125, 0.0625, 0.03125, 0.015625];
        let norm_query = normalize_vector(&query);
        let results = index.query_knn(&norm_query, 2);
        assert!(results.len() >= 2, "Should find similar normalized vectors");
    }

    /// Test multi-probe search with spectral embeddings
    #[test]
    fn test_multi_probe_with_spectral() {
        let mut index = LSHIndex::new(2, 8, 12);
        let v1 = vec![1.0, 0.5, 0.3, 0.2, 0.1, 0.05, 0.03, 0.02];
        let v2 = vec![1.1, 0.4, 0.35, 0.15, 0.12, 0.04, 0.02, 0.01];
        let v3 = vec![0.9, 0.6, 0.25, 0.3, 0.15, 0.06, 0.04, 0.03];
        index.insert(v1);
        index.insert(v2);
        index.insert(v3);
        let query = vec![1.05, 0.45, 0.32, 0.18, 0.11, 0.045, 0.025, 0.015];
        // Basic query
        let basic_results = index.query(&query);
        // Multi-probe query (a superset of the basic query's buckets)
        let probe_results = index.query_multi_probe(&query, 2);
        assert!(
            probe_results.len() >= basic_results.len(),
            "Multi-probe should find at least as many results as basic query"
        );
    }
}