Tags: AI, Vector Databases, Embeddings, Similarity Search, Machine Learning

Vector Databases for AI Applications

Satyam Parmar
January 17, 2025
10 min read


Vector databases are data stores purpose-built to index and query high-dimensional vectors efficiently. They're essential for AI applications that need semantic search, recommendation systems, or similarity matching.

Understanding Vector Embeddings

What are Vector Embeddings?

Vector embeddings are numerical representations of data (text, images, audio) in a high-dimensional space where similar items are closer together.

// Example: Text embeddings
const textEmbeddings = {
  'cat': [0.1, 0.3, -0.2, 0.8, ...], // 1536 dimensions
  'dog': [0.2, 0.4, -0.1, 0.7, ...],
  'car': [-0.3, 0.1, 0.9, -0.2, ...]
};

// Similar items have similar vectors
// cat and dog are closer than cat and car

Generating Embeddings with OpenAI

const OpenAI = require('openai');

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

class EmbeddingService {
  constructor() {
    this.model = 'text-embedding-3-small';
    this.dimensions = 1536;
  }
  
  async generateEmbedding(text) {
    try {
      const response = await openai.embeddings.create({
        model: this.model,
        input: text,
        encoding_format: 'float'
      });
      
      return response.data[0].embedding;
    } catch (error) {
      console.error('Embedding generation error:', error);
      throw new Error('Failed to generate embedding');
    }
  }
  
  async generateBatchEmbeddings(texts) {
    try {
      const response = await openai.embeddings.create({
        model: this.model,
        input: texts,
        encoding_format: 'float'
      });
      
      return response.data.map(item => item.embedding);
    } catch (error) {
      console.error('Batch embedding error:', error);
      throw new Error('Failed to generate batch embeddings');
    }
  }
  
  // Calculate cosine similarity
  cosineSimilarity(vecA, vecB) {
    if (vecA.length !== vecB.length) {
      throw new Error('Vectors must have the same length');
    }
    
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    
    for (let i = 0; i < vecA.length; i++) {
      dotProduct += vecA[i] * vecB[i];
      normA += vecA[i] * vecA[i];
      normB += vecB[i] * vecB[i];
    }
    
    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
  }
}

const embeddingService = new EmbeddingService();
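
As a quick sanity check, you can compare a few embeddings directly. A minimal sketch, assuming OPENAI_API_KEY is set and the service above is in scope:

async function similarityDemo() {
  const [cat, dog, car] = await embeddingService.generateBatchEmbeddings([
    'cat', 'dog', 'car'
  ]);
  
  // Related words should score noticeably higher
  console.log('cat vs dog:', embeddingService.cosineSimilarity(cat, dog));
  console.log('cat vs car:', embeddingService.cosineSimilarity(cat, car));
}

similarityDemo().catch(console.error);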

Vector Database Options

1. Pinecone Integration

const { Pinecone } = require('@pinecone-database/pinecone');

class PineconeService {
  constructor() {
    this.pinecone = new Pinecone({
      apiKey: process.env.PINECONE_API_KEY
    });
    this.indexName = 'ai-embeddings';
  }
  
  async initialize() {
    this.index = this.pinecone.index(this.indexName);
  }
  
  async upsertVectors(vectors) {
    try {
      // The current Pinecone Node SDK takes an array of records directly
      const records = vectors.map(vector => ({
        id: vector.id,
        values: vector.embedding,
        metadata: vector.metadata
      }));
      
      const response = await this.index.upsert(records);
      return response;
    } catch (error) {
      console.error('Pinecone upsert error:', error);
      throw error;
    }
  }
  
  async search(queryEmbedding, options = {}) {
    try {
      const searchRequest = {
        vector: queryEmbedding,
        topK: options.topK || 10,
        includeMetadata: true,
        includeValues: false
      };
      
      // Pinecone rejects an empty filter object, so only set it when provided
      if (options.filter && Object.keys(options.filter).length > 0) {
        searchRequest.filter = options.filter;
      }
      
      const response = await this.index.query(searchRequest);
      return response.matches;
    } catch (error) {
      console.error('Pinecone search error:', error);
      throw error;
    }
  }
  
  async deleteVectors(ids) {
    try {
      await this.index.deleteMany(ids);
    } catch (error) {
      console.error('Pinecone delete error:', error);
      throw error;
    }
  }
}

const pineconeService = new PineconeService();
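
Wiring the embedding service and Pinecone together looks roughly like this. A sketch, assuming an index named ai-embeddings already exists with dimension 1536 (matching text-embedding-3-small):

async function indexAndSearch() {
  await pineconeService.initialize();
  
  // Embed and store a document
  const embedding = await embeddingService.generateEmbedding('How do vector databases work?');
  await pineconeService.upsertVectors([
    { id: 'doc-1', embedding, metadata: { title: 'Intro', category: 'ai' } }
  ]);
  
  // Query with a semantically related question
  const queryEmbedding = await embeddingService.generateEmbedding('explain vector search');
  const matches = await pineconeService.search(queryEmbedding, { topK: 3 });
  console.log(matches.map(m => ({ id: m.id, score: m.score })));
}

indexAndSearch().catch(console.error);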

2. Weaviate Integration

const weaviate = require('weaviate-ts-client'); // the fluent v2 client API used below

class WeaviateService {
  constructor() {
    this.client = weaviate.client({
      scheme: 'https',
      host: process.env.WEAVIATE_HOST,
      apiKey: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY)
    });
    this.className = 'Document';
  }
  
  async createSchema() {
    const classDefinition = {
      class: this.className,
      vectorizer: 'text2vec-openai',
      moduleConfig: {
        'text2vec-openai': {
          model: 'ada',
          modelVersion: '002',
          type: 'text'
        }
      },
      properties: [
        {
          name: 'content',
          dataType: ['text'],
          description: 'The content of the document'
        },
        {
          name: 'title',
          dataType: ['string'],
          description: 'The title of the document'
        },
        {
          name: 'category',
          dataType: ['string'],
          description: 'The category of the document'
        }
      ]
    };
    
    try {
      await this.client.schema.classCreator().withClass(classDefinition).do();
      console.log('Schema created successfully');
    } catch (error) {
      if (error.message.includes('already exists')) {
        console.log('Schema already exists');
      } else {
        throw error;
      }
    }
  }
  
  async addDocument(document) {
    try {
      const result = await this.client.data
        .creator()
        .withClassName(this.className)
        .withProperties({
          content: document.content,
          title: document.title,
          category: document.category
        })
        .do();
      
      return result;
    } catch (error) {
      console.error('Weaviate add document error:', error);
      throw error;
    }
  }
  
  async searchDocuments(query, options = {}) {
    try {
      const result = await this.client.graphql
        .get()
        .withClassName(this.className)
        .withFields('content title category _additional { id distance }')
        .withNearText({
          concepts: [query],
          certainty: options.certainty || 0.7
        })
        .withLimit(options.limit || 10)
        .do();
      
      return result.data.Get[this.className];
    } catch (error) {
      console.error('Weaviate search error:', error);
      throw error;
    }
  }
}

const weaviateService = new WeaviateService();
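
Usage follows the same pattern: create the schema once, then add and query documents. A sketch, assuming your Weaviate instance has the text2vec-openai module enabled (embeddings are generated server-side, so there is no explicit embedding step):

async function weaviateDemo() {
  await weaviateService.createSchema();
  
  await weaviateService.addDocument({
    title: 'Vector search basics',
    content: 'Vector databases store embeddings and answer nearest-neighbor queries.',
    category: 'ai'
  });
  
  const hits = await weaviateService.searchDocuments('how does similarity search work?', {
    limit: 5,
    certainty: 0.7
  });
  console.log(hits);
}

weaviateDemo().catch(console.error);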

3. Chroma Integration

const { ChromaClient } = require('chromadb');

class ChromaService {
  constructor() {
    this.client = new ChromaClient({
      path: process.env.CHROMA_URL || 'http://localhost:8000'
    });
    this.collectionName = 'documents';
  }
  
  async initialize() {
    try {
      this.collection = await this.client.getOrCreateCollection({
        name: this.collectionName,
        metadata: { description: 'Document embeddings collection' }
      });
    } catch (error) {
      console.error('Chroma initialization error:', error);
      throw error;
    }
  }
  
  async addDocuments(documents) {
    try {
      const ids = documents.map((_, index) => `doc_${Date.now()}_${index}`);
      const texts = documents.map(doc => doc.content);
      const metadatas = documents.map(doc => ({
        title: doc.title,
        category: doc.category,
        source: doc.source
      }));
      
      await this.collection.add({
        ids,
        documents: texts,
        metadatas
      });
      
      return ids;
    } catch (error) {
      console.error('Chroma add documents error:', error);
      throw error;
    }
  }
  
  async search(query, options = {}) {
    try {
      const queryRequest = {
        queryTexts: [query],
        nResults: options.limit || 10
      };
      
      // Chroma rejects an empty where clause, so only set it when provided
      if (options.filter && Object.keys(options.filter).length > 0) {
        queryRequest.where = options.filter;
      }
      
      const results = await this.collection.query(queryRequest);
      
      return results;
    } catch (error) {
      console.error('Chroma search error:', error);
      throw error;
    }
  }
}

const chromaService = new ChromaService();
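
Unlike the Pinecone example, the collection here embeds documents itself. A sketch, assuming a Chroma server is running locally and the client's default embedding function is available:

async function chromaDemo() {
  await chromaService.initialize();
  
  const ids = await chromaService.addDocuments([
    { title: 'Doc A', content: 'Embeddings map text into vector space.', category: 'ai', source: 'demo' }
  ]);
  
  const results = await chromaService.search('what are embeddings?', { limit: 3 });
  console.log(ids, results.ids, results.distances);
}

chromaDemo().catch(console.error);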

Building a Semantic Search System

Document Processing Pipeline

class DocumentProcessor {
  constructor(embeddingService, vectorDB) {
    this.embeddingService = embeddingService;
    this.vectorDB = vectorDB;
  }
  
  async processDocument(document) {
    try {
      // Clean and preprocess text
      const cleanedText = this.preprocessText(document.content);
      
      // Generate embedding
      const embedding = await this.embeddingService.generateEmbedding(cleanedText);
      
      // Prepare vector data
      const vectorData = {
        id: document.id || `doc_${Date.now()}`,
        embedding,
        metadata: {
          title: document.title,
          content: cleanedText,
          category: document.category,
          source: document.source,
          createdAt: new Date().toISOString()
        }
      };
      
      // Store in vector database
      await this.vectorDB.upsertVectors([vectorData]);
      
      return vectorData;
    } catch (error) {
      console.error('Document processing error:', error);
      throw error;
    }
  }
  
  preprocessText(text) {
    return text
      .toLowerCase()
      .replace(/[^\w\s]/g, ' ') // Remove special characters
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }
  
  async processBatch(documents) {
    const results = [];
    
    for (const document of documents) {
      try {
        const result = await this.processDocument(document);
        results.push(result);
      } catch (error) {
        console.error(`Failed to process document ${document.id}:`, error);
        results.push({ id: document.id, error: error.message });
      }
    }
    
    return results;
  }
}

const documentProcessor = new DocumentProcessor(embeddingService, pineconeService);
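
Processing a single document end to end then looks like this (illustrative fields; assumes pineconeService.initialize() has already been called):

documentProcessor.processDocument({
  id: 'doc-42',
  title: 'Getting started with embeddings',
  content: 'Embeddings turn text into vectors so that similar text ends up nearby.',
  category: 'tutorial',
  source: 'blog'
}).then(vector => console.log('stored', vector.id))
  .catch(console.error);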

Semantic Search API

const express = require('express');
const app = express();

app.use(express.json()); // required to parse JSON request bodies

// Search endpoint
app.post('/api/search', async (req, res) => {
  try {
    const { query, filters = {}, limit = 10 } = req.body;
    
    if (!query) {
      return res.status(400).json({
        error: 'Query is required'
      });
    }
    
    // Generate query embedding
    const queryEmbedding = await embeddingService.generateEmbedding(query);
    
    // Search vector database
    const results = await pineconeService.search(queryEmbedding, {
      topK: limit,
      filter: filters
    });
    
    // Format results
    const formattedResults = results.map(match => ({
      id: match.id,
      score: match.score,
      title: match.metadata.title,
      content: match.metadata.content,
      category: match.metadata.category,
      source: match.metadata.source
    }));
    
    res.json({
      success: true,
      query,
      results: formattedResults,
      total: formattedResults.length
    });
  } catch (error) {
    console.error('Search error:', error);
    res.status(500).json({
      error: 'Search failed'
    });
  }
});

// Add document endpoint
app.post('/api/documents', async (req, res) => {
  try {
    const { documents } = req.body;
    
    if (!documents || !Array.isArray(documents)) {
      return res.status(400).json({
        error: 'Documents array is required'
      });
    }
    
    const results = await documentProcessor.processBatch(documents);
    
    res.json({
      success: true,
      processed: results.length,
      results
    });
  } catch (error) {
    console.error('Document processing error:', error);
    res.status(500).json({
      error: 'Document processing failed'
    });
  }
});
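
To exercise the API, start the server and post a query. A minimal sketch (the port is arbitrary; Node 18+ ships a global fetch):

app.listen(3000, () => console.log('Search API listening on port 3000'));

// Example client call
async function trySearch() {
  const res = await fetch('http://localhost:3000/api/search', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ query: 'vector similarity', limit: 5 })
  });
  console.log(await res.json());
}

trySearch().catch(console.error);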

Advanced Vector Operations

Hybrid Search (Vector + Keyword)

class HybridSearchService {
  constructor(vectorDB, textSearchDB) {
    this.vectorDB = vectorDB;
    this.textSearchDB = textSearchDB;
  }
  
  async hybridSearch(query, options = {}) {
    const { vectorWeight = 0.7, textWeight = 0.3 } = options;
    
    // Vector search
    const queryEmbedding = await embeddingService.generateEmbedding(query);
    const vectorResults = await this.vectorDB.search(queryEmbedding, {
      topK: options.limit || 20
    });
    
    // Text search (using PostgreSQL full-text search)
    const textResults = await this.textSearchDB.query(
      `SELECT id, title, content, ts_rank(to_tsvector('english', content), plainto_tsquery('english', $1)) as rank
       FROM documents 
       WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $1)
       ORDER BY rank DESC
       LIMIT $2`,
      [query, options.limit || 20]
    );
    
    // Combine and score results
    const combinedResults = this.combineResults(
      vectorResults,
      textResults.rows,
      vectorWeight,
      textWeight
    );
    
    return combinedResults
      .sort((a, b) => b.score - a.score)
      .slice(0, options.limit || 10);
  }
  
  combineResults(vectorResults, textResults, vectorWeight, textWeight) {
    const resultMap = new Map();
    
    // Add vector results
    vectorResults.forEach(result => {
      resultMap.set(result.id, {
        id: result.id,
        title: result.metadata.title,
        content: result.metadata.content,
        vectorScore: result.score,
        textScore: 0,
        combinedScore: result.score * vectorWeight
      });
    });
    
    // Add text results
    textResults.forEach(result => {
      const existing = resultMap.get(result.id);
      if (existing) {
        existing.textScore = result.rank;
        existing.combinedScore += result.rank * textWeight;
      } else {
        resultMap.set(result.id, {
          id: result.id,
          title: result.title,
          content: result.content,
          vectorScore: 0,
          textScore: result.rank,
          combinedScore: result.rank * textWeight
        });
      }
    });
    
    return Array.from(resultMap.values());
  }
}
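
One caveat with the weighting above: cosine scores from the vector store fall roughly in [0, 1], while ts_rank is unbounded, so a raw weighted sum can let one signal dominate. A common remedy is to min-max normalize each result set before combining; a sketch of such a helper (not part of the original class):

// Min-max normalize one score field across a result set into [0, 1]
function normalizeScores(results, key) {
  const scores = results.map(r => r[key]);
  const min = Math.min(...scores);
  const range = (Math.max(...scores) - min) || 1; // guard against division by zero
  
  return results.map(r => ({ ...r, [key]: (r[key] - min) / range }));
}

// Usage: normalize before calling combineResults
// vectorResults = normalizeScores(vectorResults, 'score');
// textRows = normalizeScores(textResults.rows, 'rank');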

Vector Clustering and Categorization

class VectorClusteringService {
  constructor(vectorDB) {
    this.vectorDB = vectorDB;
  }
  
  async clusterDocuments(options = {}) {
    const { k = 5, minClusterSize = 3 } = options;
    
    // Fetch every stored vector; most hosted stores have no cheap full scan,
    // so this is delegated to the backing store (see the sketch below)
    const allVectors = await this.vectorDB.getAllVectors();
    
    if (allVectors.length < k) {
      throw new Error('Not enough documents for clustering');
    }
    
    // Simple K-means clustering
    const clusters = await this.kMeansClustering(allVectors, k);
    
    // Filter clusters by minimum size
    const validClusters = clusters.filter(
      cluster => cluster.documents.length >= minClusterSize
    );
    
    return validClusters;
  }
  
  async kMeansClustering(vectors, k) {
    // Initialize centroids randomly
    const centroids = this.initializeCentroids(vectors, k);
    
    let clusters = [];
    let iterations = 0;
    const maxIterations = 100;
    
    while (iterations < maxIterations) {
      // Assign vectors to nearest centroid
      clusters = this.assignToClusters(vectors, centroids);
      
      // Update centroids (pass the current ones so empty clusters keep theirs)
      const newCentroids = this.updateCentroids(clusters, centroids);
      
      // Check for convergence
      if (this.hasConverged(centroids, newCentroids)) {
        break;
      }
      
      centroids = newCentroids;
      iterations++;
    }
    
    return clusters;
  }
  
  initializeCentroids(vectors, k) {
    const centroids = [];
    const usedIndices = new Set();
    
    while (centroids.length < k) {
      const randomIndex = Math.floor(Math.random() * vectors.length);
      if (!usedIndices.has(randomIndex)) {
        centroids.push([...vectors[randomIndex].embedding]);
        usedIndices.add(randomIndex);
      }
    }
    
    return centroids;
  }
  
  assignToClusters(vectors, centroids) {
    const clusters = centroids.map(() => ({ documents: [], centroid: null }));
    
    vectors.forEach(vector => {
      let minDistance = Infinity;
      let closestCluster = 0;
      
      centroids.forEach((centroid, index) => {
        const distance = this.euclideanDistance(vector.embedding, centroid);
        if (distance < minDistance) {
          minDistance = distance;
          closestCluster = index;
        }
      });
      
      clusters[closestCluster].documents.push(vector);
    });
    
    return clusters;
  }
  
  updateCentroids(clusters, previousCentroids) {
    return clusters.map((cluster, index) => {
      if (cluster.documents.length === 0) {
        // An empty cluster keeps its previous centroid instead of collapsing to []
        return previousCentroids[index];
      }
      
      const dimension = cluster.documents[0].embedding.length;
      const newCentroid = new Array(dimension).fill(0);
      
      cluster.documents.forEach(doc => {
        doc.embedding.forEach((value, index) => {
          newCentroid[index] += value;
        });
      });
      
      return newCentroid.map(value => value / cluster.documents.length);
    });
  }
  
  euclideanDistance(vecA, vecB) {
    let sum = 0;
    for (let i = 0; i < vecA.length; i++) {
      sum += Math.pow(vecA[i] - vecB[i], 2);
    }
    return Math.sqrt(sum);
  }
  
  hasConverged(oldCentroids, newCentroids, threshold = 0.001) {
    for (let i = 0; i < oldCentroids.length; i++) {
      const distance = this.euclideanDistance(oldCentroids[i], newCentroids[i]);
      if (distance > threshold) {
        return false;
      }
    }
    return true;
  }
}
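
Note that getAllVectors is delegated to the backing store because most hosted vector databases have no cheap "scan everything" operation; in practice you keep a side copy of the embeddings in memory, on disk, or in a relational table. A minimal in-memory sketch, purely illustrative:

// Hypothetical backing store: clustering needs every embedding locally
class InMemoryVectorStore {
  constructor() {
    this.vectors = []; // items shaped like { id, embedding, metadata }
  }
  
  async upsertVectors(vectors) {
    this.vectors.push(...vectors);
  }
  
  async getAllVectors() {
    return this.vectors;
  }
}

const clusteringService = new VectorClusteringService(new InMemoryVectorStore());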

Performance Optimization

Batch Processing

class BatchProcessor {
  constructor(embeddingService, vectorDB) {
    this.embeddingService = embeddingService;
    this.vectorDB = vectorDB;
    this.batchSize = 100;
    this.delayMs = 1000; // 1 second between batches
  }
  
  async processBatch(documents) {
    const results = [];
    
    for (let i = 0; i < documents.length; i += this.batchSize) {
      const batch = documents.slice(i, i + this.batchSize);
      
      try {
        // Generate embeddings for batch
        const texts = batch.map(doc => doc.content);
        const embeddings = await this.embeddingService.generateBatchEmbeddings(texts);
        
        // Prepare vector data
        const vectors = batch.map((doc, index) => ({
          id: doc.id || `doc_${Date.now()}_${i + index}`,
          embedding: embeddings[index],
          metadata: {
            title: doc.title,
            content: doc.content,
            category: doc.category
          }
        }));
        
        // Upsert to vector database
        await this.vectorDB.upsertVectors(vectors);
        
        results.push(...vectors);
        
        // Delay between batches to avoid rate limiting
        if (i + this.batchSize < documents.length) {
          await this.sleep(this.delayMs);
        }
      } catch (error) {
        console.error(`Batch ${i / this.batchSize + 1} failed:`, error);
        results.push(...batch.map(doc => ({ id: doc.id, error: error.message })));
      }
    }
    
    return results;
  }
  
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
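
Driving the batch processor is then a single call; batch size and delay are trade-offs against your embedding provider's rate limits (the defaults above are conservative guesses, not provider-specific guidance):

const batchProcessor = new BatchProcessor(embeddingService, pineconeService);

// `documents` is an array of { id, title, content, category } objects
batchProcessor.processBatch(documents)
  .then(results => {
    const failed = results.filter(r => r.error);
    console.log(`Indexed ${results.length - failed.length}, failed: ${failed.length}`);
  })
  .catch(console.error);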

Conclusion

Vector databases are essential for building AI-powered applications that need:

  1. Semantic Search - Find similar content based on meaning
  2. Recommendation Systems - Suggest relevant items to users
  3. Content Categorization - Automatically organize content
  4. Question Answering - Find relevant information for queries
  5. Similarity Matching - Identify duplicate or similar items

Key considerations when choosing a vector database:

  • Scalability - Can it handle your data volume?
  • Performance - How fast are queries?
  • Features - What operations does it support?
  • Cost - What's the pricing model?
  • Integration - How easy is it to integrate?

Start with a simple implementation and gradually add complexity as your needs grow. Remember to monitor performance and costs as you scale your vector database usage.
