Vector Database API
The vector database provides semantic search capabilities for message history and context retrieval using embeddings.
Initialization
Basic Setup
-- Minimal setup: only the database path and the privacy level are supplied.
local basic_options = {
  db_path = "./data/buidl.db",
  privacy_level = "high",
}
local vector_db = VectorDB.new(basic_options)
Configuration Options
-- Options shown in this guide, grouped by concern.
local config = {
  -- Storage
  db_path = "./data/buidl.db",  -- Database file location
  max_messages = 10000,         -- Maximum messages to store

  -- Embeddings
  privacy_level = "high",       -- high, medium, low
  embedding_dimensions = 384,   -- Embedding vector dimensions
  similarity_threshold = 0.7,   -- Minimum similarity for matches

  -- LSH index
  use_lsh_index = true,         -- Enable LSH indexing for performance
  lsh_tables = 10,              -- Number of LSH tables
  lsh_hash_size = 10,           -- LSH hash size
}

local vector_db = VectorDB.new(config)
Core Operations
Adding Messages
-- Add single message
local question = {
  text = "How do I deploy to production?",
  user = "U1234567890",
  channel = "C1234567890",
  timestamp = "1704067200.123456",
  thread_ts = "1704067100.123456", -- Optional: thread timestamp
}
vector_db:add_message(question)

-- Add message with metadata
local failure_metadata = {
  urgency = "high",
  category = "deployment",
  tags = { "error", "production" },
}
vector_db:add_message({
  text = "The deployment failed with error code 500",
  user = "U1234567890",
  channel = "C1234567890",
  timestamp = "1704067200.123456",
  metadata = failure_metadata,
})
Searching Messages
-- Basic search: at most five results, restricted to one channel.
local basic_filters = { limit = 5, channel = "C1234567890" }
local results = vector_db:search("deployment help", basic_filters)
-- Advanced search with filters
local results = vector_db:search("database error", {
  limit = 10,
  channel = "C1234567890",
  user = "U1234567890",          -- Filter by user
  time_range = {
    start = "1704000000.000000",
    -- `end` is a reserved word in Lua, so it cannot be written as a bare
    -- field name; the bracketed string form creates the same "end" key.
    ["end"] = "1704086400.000000",
  },
  min_similarity = 0.8,          -- Higher similarity threshold
  include_metadata = true,       -- Include metadata in results
})
Search Results Format
-- Illustrative shape of a search result list: an array of result entries.
{
{
-- Message fields, as supplied to add_message
text = "To deploy to production, run: npm run deploy:prod",
user = "U1234567890",
channel = "C1234567890",
timestamp = "1704067200.123456",
-- Similarity value reported for this match
similarity = 0.92,
-- NOTE(review): presumably only present when the search was made with
-- include_metadata = true — confirm against the implementation
metadata = {
category = "deployment",
tags = {"production", "npm"}
}
},
-- More results...
}
Embedding Management
Privacy Levels
High Privacy (Local Only)
-- High privacy: configured with a local model only.
local high_privacy_options = {
  privacy_level = "high",
  local_model = "sentence-transformers/all-MiniLM-L6-v2",
}
local embeddings = PrivacyConsciousEmbeddings.new(high_privacy_options)
Medium Privacy (Filtered External)
-- Medium privacy: external endpoint with the PII filter switched on.
local medium_privacy_options = {
  privacy_level = "medium",
  pii_filter = true,
  api_endpoint = "https://api.openai.com/v1/embeddings",
}
local embeddings = PrivacyConsciousEmbeddings.new(medium_privacy_options)
Low Privacy (Full External)
-- Low privacy: external endpoint with an explicit model name.
local low_privacy_options = {
  privacy_level = "low",
  api_endpoint = "https://api.openai.com/v1/embeddings",
  model = "text-embedding-ada-002",
}
local embeddings = PrivacyConsciousEmbeddings.new(low_privacy_options)
Custom Embedding Providers
--- Skeleton for a user-supplied embedding provider.
local CustomEmbedding = {}
-- Method lookups on instances fall through to this table. Without __index,
-- objects returned by `new` would have no `generate_embedding` method.
CustomEmbedding.__index = CustomEmbedding

--- Create a provider instance.
-- @param config provider configuration, stored unchanged on `self.config`
-- @return a new CustomEmbedding instance
function CustomEmbedding.new(config)
  local self = setmetatable({}, CustomEmbedding)
  self.config = config
  return self
end

--- Produce the embedding for a piece of text.
-- @param text input handed to `your_embedding_function`
-- @return whatever `your_embedding_function(text)` returns
function CustomEmbedding:generate_embedding(text)
  -- Your custom embedding logic
  local embedding = your_embedding_function(text)
  return embedding
end
-- Register custom provider: build the instance first, then hand it over.
local custom_provider = CustomEmbedding.new(config)
vector_db:set_embedding_provider(custom_provider)
Performance Optimization
LSH Indexing
-- Enable LSH for sub-linear search
local lsh_options = {
  db_path = "./data/buidl.db",
  use_lsh_index = true,
  lsh_tables = 20,     -- More tables = better recall
  lsh_hash_size = 12,  -- Larger hash = more precision
}
local vector_db = VectorDB.new(lsh_options)

-- Search with LSH
local lsh_query = {
  limit = 10,
  candidate_limit = 100, -- LSH candidates to evaluate
}
local results = vector_db:search_lsh("deployment error", lsh_query)
Batch Operations
-- Add multiple messages efficiently
local messages = {
  { text = "Message 1", user = "U1", channel = "C1", timestamp = "1.1" },
  { text = "Message 2", user = "U2", channel = "C1", timestamp = "1.2" },
  -- ... more messages
}

local batch_options = {
  batch_size = 100, -- Process in batches
  parallel = true,  -- Enable parallel processing
}
vector_db:add_batch(messages, batch_options)
Indexing Strategies
-- Create specialized indexes: one per single field (index name == field
-- name), then one composite index over two fields.
for _, field in ipairs({ "channel", "user", "timestamp" }) do
  vector_db:create_index(field, field)
end
vector_db:create_index("composite", { "channel", "user" })

-- Use indexes in queries
local indexed_query = {
  use_index = "channel",
  channel = "C1234567890",
}
local results = vector_db:search("help", indexed_query)
Database Management
Statistics and Monitoring
-- Fetch the statistics table once and print each field on its own line.
local stats = vector_db:get_stats()

local total_messages = stats.total_messages
print("Messages stored: " .. total_messages)

local size_mb = stats.size_mb
print("Database size: " .. size_mb .. " MB")

local avg_similarity = stats.avg_similarity
print("Average similarity: " .. avg_similarity)

local index_efficiency = stats.index_efficiency
print("Index efficiency: " .. index_efficiency)
Maintenance Operations
-- Maintenance calls, issued in this order: optimize, rebuild indexes,
-- clean up old messages, vacuum.
vector_db:optimize()          -- Optimize database
vector_db:rebuild_indexes()   -- Rebuild indexes

-- Cleanup old messages
local cleanup_rules = {
  older_than = "30d",   -- Remove messages older than 30 days
  keep_minimum = 1000,  -- But keep at least 1000 messages
}
vector_db:cleanup(cleanup_rules)

vector_db:vacuum()            -- Vacuum database
Backup and Restore
-- Create backup; the file name embeds the current date as YYYYMMDD.
local backup_path = "./backups/buidl_" .. os.date("%Y%m%d") .. ".db"
vector_db:backup(backup_path)

-- Restore from backup
vector_db:restore("./backups/buidl_20240115.db")

-- Export to JSON
local export_options = {
  include_embeddings = false,
  anonymize_users = true,
}
vector_db:export_json("./exports/messages.json", export_options)
Data Privacy
PII Detection and Filtering
-- Lua patterns (not regular expressions) keyed by the kind of PII they match.
local pii_patterns = {
  email = "[%w%._%+%-]+@[%w%.%-]+%.%w+",
  phone = "%d%d%d%-%d%d%d%-%d%d%d%d",
  ssn = "%d%d%d%-%d%d%-%d%d%d%d",
  credit_card = "%d%d%d%d%s%d%d%d%d%s%d%d%d%d%s%d%d%d%d",
}

local pii_filter = PIIFilter.new({
  patterns = pii_patterns,
  replacement = "[REDACTED]",
  log_detections = true,
})

-- Filter text before embedding
local filtered_text = pii_filter:filter(original_text)
local sanitized_message = {
  text = filtered_text,
  original_hash = hash(original_text), -- For verification
}
vector_db:add_message(sanitized_message)
Data Retention Policies
-- Set retention policy: a default plus per-channel and per-user overrides.
local channel_policies = {
  ["C_PRIVATE"] = "7d",     -- Private channels: 7 days
  ["C_PUBLIC"] = "365d",    -- Public channels: 1 year
  ["C_ARCHIVE"] = "never",  -- Archive channels: never delete
}
local user_preferences = {
  ["U_GDPR_USER"] = "30d",  -- GDPR user: 30 days max
}

vector_db:set_retention_policy({
  default_retention = "90d",
  channel_policies = channel_policies,
  user_preferences = user_preferences,
})
Error Handling
Database Errors
-- Run the insert under pcall so a failure can be classified and handled
-- instead of propagating to the caller.
local success, error_msg = pcall(function()
  vector_db:add_message(message)
end)

if not success then
  -- error() accepts any value, so the caught error is not guaranteed to be a
  -- string. Normalise it once so the lookups and the log concatenation below
  -- cannot themselves raise.
  local reason = tostring(error_msg)

  -- Plain-text find (4th argument true): these are literal substrings, not
  -- Lua patterns.
  if reason:find("database locked", 1, true) then
    -- Retry with exponential backoff
    retry_with_backoff(function()
      vector_db:add_message(message)
    end)
  elseif reason:find("disk full", 1, true) then
    -- Cleanup old data
    vector_db:cleanup({ older_than = "7d" })
  else
    -- Log error
    logger:error("Database error: " .. reason)
  end
end
Embedding Errors
--- Generate an embedding for `text`, falling back to TF-IDF when the
-- provider fails.
-- The provider is invoked through `generate_embedding`, the method name used
-- by the custom-provider interface in this guide.
-- NOTE(review): confirm that built-in providers expose the same method name.
-- @param text input passed to the provider
-- @return the provider's embedding, or `tfidf_embedding(text)` when the
--   provider raises an error or returns nil
local function safe_generate_embedding(text)
  local ok, embedding = pcall(function()
    return embedding_provider:generate_embedding(text)
  end)

  -- A nil result is treated like a failure so callers always get a vector.
  if ok and embedding ~= nil then
    return embedding
  end

  -- Fallback to simple TF-IDF
  return tfidf_embedding(text)
end
Examples
Basic Message Storage and Search
-- Initialize database
local db = VectorDB.new({ db_path = "./messages.db" })

-- Add some messages: a question followed by its answer, same channel.
local question = {
  text = "How do I restart the service?",
  user = "alice",
  channel = "devops",
}
local answer = {
  text = "sudo systemctl restart myservice",
  user = "bob",
  channel = "devops",
}
db:add_message(question)
db:add_message(answer)

-- Search for help and print each hit with its similarity.
local results = db:search("restart service", { limit = 5 })
for _, hit in ipairs(results) do
  local line = hit.text .. " (similarity: " .. hit.similarity .. ")"
  print(line)
end
Advanced Context Retrieval
--- Context-aware search for AI responses.
-- Runs two searches on `db` — the channel's messages from the last hour and
-- the messages most similar to `query` — and merges them into one list.
-- @param query text to search for
-- @param channel channel id both searches are restricted to
-- @param max_context maximum number of similarity matches (default 5)
-- @return array of messages: similarity matches first, then last-hour
--   messages not already included; entries sharing a timestamp appear once
function get_conversation_context(query, channel, max_context)
  -- Read the clock once so both ends of the window share the same "now".
  local now = os.time()

  local recent_messages = db:search("", {
    channel = channel,
    time_range = {
      start = tostring(now - 3600), -- Last hour
      -- `end` is a reserved word, so the key must use the bracketed form.
      ["end"] = tostring(now),
    },
    limit = 20,
    sort_by = "timestamp",
  })

  local relevant_messages = db:search(query, {
    channel = channel,
    limit = max_context or 5,
    min_similarity = 0.6,
  })

  -- Combine and deduplicate
  local context = {}
  local seen = {}

  -- Append every message whose timestamp has not been seen yet.
  local function append_unique(messages)
    for _, msg in ipairs(messages or {}) do
      local key = msg.timestamp
      if key == nil then
        -- nil cannot be used as a table key; a message without a timestamp
        -- cannot be deduplicated, so keep it.
        context[#context + 1] = msg
      elseif not seen[key] then
        seen[key] = true
        context[#context + 1] = msg
      end
    end
  end

  append_unique(relevant_messages)
  append_unique(recent_messages)

  return context
end