1 unstable release
Uses new Rust 2024
new 0.1.0 | Apr 2, 2025 |
---|
#14 in #hnsw
59 downloads per month
64KB
1K
SLoC
Anda-DB HNSW Vector Search Library
A high-performance implementation of Hierarchical Navigable Small World (HNSW) algorithm for approximate nearest neighbor search in high-dimensional spaces.
Features
- Fast approximate nearest neighbor search
- Multiple distance metrics (Euclidean, Cosine, Inner Product, Manhattan)
- Configurable index parameters
- Thread-safe implementation with concurrent read/write operations
- Serialization and deserialization support
- Support for bf16 (brain floating point) vector storage for memory efficiency
Installation
Add this to your Cargo.toml
:
[dependencies]
anda_db_hnsw = "0.1.0"
Quick Start
use anda_db_hnsw::{HnswConfig, HnswIndex};
// Create a new index for 384-dimensional vectors
const DIM: usize = 384;
let config = HnswConfig::default();
let index = HnswIndex::new(DIM, config);
// Insert vectors
let vector1: Vec<f32> = vec![0.1, 0.2, /* ... */]; // 384 dimensions
index.insert_f32(1, vector1)?;
// Search for nearest neighbors
let query: Vec<f32> = vec![0.15, 0.25, /* ... */]; // 384 dimensions
let results = index.search_f32(&query, 10)?;
// Results contain (id, distance) pairs
for (id, distance) in results {
println!("ID: {}, Distance: {}", id, distance);
}
// Save index to file
let mut file = std::fs::File::create("index.cbor")?;
index.save(&mut file)?;
// Load index from file
let file = std::fs::File::open("index.cbor")?;
let loaded_index = HnswIndex::load(&file)?;
Advanced Usage
Custom Configuration
use anda_db_hnsw::{HnswConfig, HnswIndex, DistanceMetric, SelectNeighborsStrategy};
let config = HnswConfig {
max_layers: 16, // Maximum number of layers
max_connections: 32, // Maximum connections per node
ef_construction: 200, // Expansion factor during construction
ef_search: 50, // Candidates to consider during search
distance_metric: DistanceMetric::Cosine, // Distance metric
max_elements: Some(1_000_000), // Optional capacity limit
scale_factor: Some(1.2), // Layer distribution scaling
select_neighbors_strategy: SelectNeighborsStrategy::Heuristic, // Neighbor selection strategy
};
let index = HnswIndex::new(384, config);
Distance Metrics
The library supports multiple distance metrics:
// Euclidean distance (L2 norm)
let config = HnswConfig {
distance_metric: DistanceMetric::Euclidean,
..Default::default()
};
// Cosine distance (1 - cosine similarity)
let config = HnswConfig {
distance_metric: DistanceMetric::Cosine,
..Default::default()
};
// Inner product (negative dot product)
let config = HnswConfig {
distance_metric: DistanceMetric::InnerProduct,
..Default::default()
};
// Manhattan distance (L1 norm)
let config = HnswConfig {
distance_metric: DistanceMetric::Manhattan,
..Default::default()
};
Removing Vectors
// Remove a vector by ID
let removed = index.remove(vector_id)?;
Index Statistics
// Get index statistics
let stats = index.stats();
println!("Total vectors: {}", stats.num_elements);
println!("Deleted vectors: {}", stats.num_deleted);
println!("Max layer: {}", stats.max_layer);
println!("Avg connections: {:.2}", stats.avg_connections);
println!("Search operations: {}", stats.search_count);
println!("Insert operations: {}", stats.insert_count);
println!("Delete operations: {}", stats.delete_count);
Example
Here's a complete example demonstrating vector insertion, search, and deletion:
use anda_db_hnsw::{HnswConfig, HnswIndex};
use rand::Rng;
use tokio::time;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
const DIM: usize = 384;
// Create index (384-dimensional vectors, like BERT embeddings)
let config = HnswConfig::default();
let index = HnswIndex::new(DIM, config);
// Simulate data stream
let mut rng = rand::thread_rng();
let mut insert_start = time::Instant::now();
for i in 0..100_000 {
// Generate random vector
let vector: Vec<f32> = (0..DIM).map(|_| rng.gen::<f32>()).collect();
let _ = index.insert_f32(i as u64, vector)?;
// Simulate search queries
if i % 100 == 0 {
println!("{} inserted 100 vectors in {:?}", i, insert_start.elapsed());
insert_start = time::Instant::now();
let query: Vec<f32> = (0..DIM).map(|_| rng.gen::<f32>()).collect();
let query_start = time::Instant::now();
let results = index.search_f32(&query, 10)?;
println!(
"{} Search returned {} results in {:?}",
i,
results.len(),
query_start.elapsed()
);
}
// Simulate deletion
if i % 1000 == 0 && i > 0 {
let to_remove = rng.gen_range(0..i);
let remove_start = time::Instant::now();
index.remove(to_remove as u64)?;
println!(
"{} Removed vector {} in {:?}",
i,
to_remove,
remove_start.elapsed()
);
}
}
// Print statistics
let stats = index.stats();
println!("Index statistics:");
println!("- Total vectors: {}", stats.num_elements);
println!("- Deleted vectors: {}", stats.num_deleted);
println!("- Max layer: {}", stats.max_layer);
println!("- Avg connections: {:.2}", stats.avg_connections);
println!("- Search operations: {}", stats.search_count);
println!("- Insert operations: {}", stats.insert_count);
println!("- Delete operations: {}", stats.delete_count);
// Save and load
{
let mut file = std::fs::File::create("hnsw_demo.cbor")?;
let save_start = time::Instant::now();
index.save(&mut file)?;
println!("Saved index in {:?}", save_start.elapsed());
}
let file = std::fs::File::open("hnsw_demo.cbor")?;
let load_start = time::Instant::now();
let loaded_index = HnswIndex::load(&file)?;
println!("Loaded index in {:?}", load_start.elapsed());
// Test search on loaded index
let query: Vec<f32> = (0..DIM).map(|_| rng.gen::<f32>()).collect();
let query_start = time::Instant::now();
let results = loaded_index.search_f32(&query, 10)?;
println!(
"Search returned {} results in {:?}",
results.len(),
query_start.elapsed()
);
Ok(())
}
Error Handling
The library uses a custom error type HnswError
for various error conditions:
HnswError::Db
: Database-related errorsHnswError::Cbor
: Serialization/deserialization errorsHnswError::DimensionMismatch
: When vector dimensions don't match the indexHnswError::EmptyIndex
: When trying to search an empty indexHnswError::IndexFull
: When index has reached its maximum capacityHnswError::NotFound
: When a vector with the specified ID is not foundHnswError::AlreadyExists
: When trying to add a vector with an ID that already existsHnswError::DistanceMetric
: Errors related to distance calculations
Performance Considerations
- The HNSW algorithm provides logarithmic search complexity with respect to the number of vectors
- The
ef_search
parameter controls the trade-off between search speed and accuracy - The
ef_construction
parameter affects build time and quality of the graph structure - The
SelectNeighborsStrategy::Heuristic
option provides better search quality at the cost of longer index construction time - Using bf16 format for vector storage significantly reduces memory usage with minimal precision loss
License
Copyright © 2025 LDC Labs.
ldclabs/anda-db
is licensed under the MIT License. See LICENSE for the full license text.
Dependencies
~3.5–9MB
~77K SLoC