Large Language Model Integration Guide for Backend Engineers
Integrating Large Language Models (LLMs) into backend applications enables intelligent features such as text generation, chat, and classification. This guide walks through basic API integration and the production concerns that follow: prompt engineering, caching, error handling, cost management, security, and monitoring.
Understanding LLM APIs
OpenAI API Integration
const OpenAI = require('openai');

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
  timeout: 30000, // 30 seconds
  maxRetries: 3
});

// Basic text completion
async function generateText(prompt, options = {}) {
  try {
    const completion = await openai.chat.completions.create({
      model: options.model || 'gpt-3.5-turbo',
      messages: [
        {
          role: 'system',
          content: options.systemPrompt || 'You are a helpful assistant.'
        },
        { role: 'user', content: prompt }
      ],
      max_tokens: options.maxTokens || 1000,
      temperature: options.temperature || 0.7,
      top_p: options.topP || 1,
      frequency_penalty: options.frequencyPenalty || 0,
      presence_penalty: options.presencePenalty || 0
    });

    return {
      text: completion.choices[0].message.content,
      usage: completion.usage,
      model: completion.model
    };
  } catch (error) {
    console.error('OpenAI API error:', error);
    throw new Error('Failed to generate text');
  }
}
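For reference, here is a minimal sketch of calling generateText from other backend code. The function name summarizeTicket, the prompt wording, and the option values are illustrative assumptions, not part of the wrapper above.

// Minimal usage sketch (names and values are illustrative)
async function summarizeTicket(ticketText) {
  const result = await generateText(`Summarize this support ticket:\n\n${ticketText}`, {
    systemPrompt: 'You are a concise support assistant.',
    maxTokens: 300,
    temperature: 0.3
  });

  // The wrapper returns text, usage, and model from the completion
  console.log(`Model: ${result.model}, tokens: ${result.usage.total_tokens}`);
  return result.text;
}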
Anthropic Claude Integration
const Anthropic = require('@anthropic-ai/sdk');

const anthropic = new Anthropic({
  apiKey: process.env.ANTHROPIC_API_KEY
});

async function generateWithClaude(prompt, options = {}) {
  try {
    const response = await anthropic.messages.create({
      model: options.model || 'claude-3-sonnet-20240229',
      max_tokens: options.maxTokens || 1000,
      temperature: options.temperature || 0.7,
      messages: [
        { role: 'user', content: prompt }
      ]
    });

    return {
      text: response.content[0].text,
      usage: response.usage,
      model: response.model
    };
  } catch (error) {
    console.error('Anthropic API error:', error);
    throw new Error('Failed to generate text with Claude');
  }
}
Prompt Engineering Best Practices
Structured Prompt Templates
class PromptTemplate {
  constructor(template, variables = {}) {
    this.template = template;
    this.variables = variables;
  }

  format(context = {}) {
    let prompt = this.template;

    // Replace variables in template
    Object.entries({ ...this.variables, ...context }).forEach(([key, value]) => {
      const placeholder = `{{${key}}}`;
      prompt = prompt.replace(new RegExp(placeholder, 'g'), value);
    });

    return prompt;
  }
}

// Email generation template
const emailTemplate = new PromptTemplate(`
You are a professional email assistant. Generate a {{tone}} email based on the following context:

**Recipient:** {{recipientName}}
**Subject:** {{subject}}
**Key Points:** {{keyPoints}}
**Tone:** {{tone}}
**Length:** {{length}}

Please generate an appropriate email that:
- Addresses the recipient professionally
- Covers all key points
- Matches the requested tone
- Is approximately {{length}} words

Email:
`);

// Usage
const emailPrompt = emailTemplate.format({
  recipientName: 'John Smith',
  subject: 'Project Update',
  keyPoints: 'Project is on track, milestone achieved, next steps planned',
  tone: 'professional and positive',
  length: '150-200'
});
Few-Shot Learning Patterns
class FewShotPromptBuilder {
  constructor() {
    this.examples = [];
  }

  addExample(input, output) {
    this.examples.push({ input, output });
    return this;
  }

  build(task, newInput) {
    let prompt = `Task: ${task}\n\n`;

    // Add examples
    this.examples.forEach((example, index) => {
      prompt += `Example ${index + 1}:\n`;
      prompt += `Input: ${example.input}\n`;
      prompt += `Output: ${example.output}\n\n`;
    });

    // Add new input
    prompt += `Now, please complete the task for this input:\n`;
    prompt += `Input: ${newInput}\n`;
    prompt += `Output:`;

    return prompt;
  }
}

// Usage for text classification
const classifier = new FewShotPromptBuilder()
  .addExample('I love this product!', 'Positive')
  .addExample('This is terrible quality.', 'Negative')
  .addExample('The product is okay, nothing special.', 'Neutral');

const classificationPrompt = classifier.build(
  'Classify the sentiment of customer reviews',
  'The service was average, could be better.'
);
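Tying this together with the generateText wrapper from earlier, a hypothetical end-to-end classifier might look like the sketch below. The classifyReview function name and the low temperature and token budget are illustrative assumptions.

// Hypothetical end-to-end use: send the built prompt through generateText (defined earlier)
async function classifyReview(review) {
  const prompt = classifier.build(
    'Classify the sentiment of customer reviews',
    review
  );

  // A low temperature and small token budget suit single-label outputs
  const result = await generateText(prompt, { temperature: 0.2, maxTokens: 5 });
  return result.text.trim(); // expected: 'Positive', 'Negative', or 'Neutral'
}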
Backend API Design for LLM Integration
RESTful LLM Service
const express = require('express');
const rateLimit = require('express-rate-limit');

const app = express();
app.use(express.json()); // parse JSON request bodies

// Rate limiting for LLM endpoints
const llmLimiter = rateLimit({
  windowMs: 15 * 60 * 1000, // 15 minutes
  max: 10, // limit each IP to 10 requests per windowMs
  message: {
    error: 'Too many LLM requests, please try again later.'
  }
});

// Text generation endpoint
app.post('/api/generate/text', llmLimiter, async (req, res) => {
  try {
    const { prompt, options = {} } = req.body;

    if (!prompt) {
      return res.status(400).json({ error: 'Prompt is required' });
    }

    const result = await generateText(prompt, options);

    res.json({
      success: true,
      data: {
        text: result.text,
        model: result.model,
        usage: result.usage
      }
    });
  } catch (error) {
    console.error('Text generation error:', error);
    res.status(500).json({ error: 'Failed to generate text' });
  }
});

// Chat completion endpoint
app.post('/api/chat/completions', llmLimiter, async (req, res) => {
  try {
    const { messages, options = {} } = req.body;

    if (!messages || !Array.isArray(messages)) {
      return res.status(400).json({ error: 'Messages array is required' });
    }

    const completion = await openai.chat.completions.create({
      model: options.model || 'gpt-3.5-turbo',
      messages,
      max_tokens: options.maxTokens || 1000,
      temperature: options.temperature || 0.7
    });

    res.json({
      success: true,
      data: {
        message: completion.choices[0].message,
        usage: completion.usage,
        model: completion.model
      }
    });
  } catch (error) {
    console.error('Chat completion error:', error);
    res.status(500).json({ error: 'Failed to complete chat' });
  }
});
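For context, a calling service might hit the text endpoint like the sketch below. It assumes Node 18+ global fetch; the localhost URL and payload shape are illustrative and should match wherever you deploy the service.

// Hypothetical client call to the text generation endpoint (URL and payload are illustrative)
async function callGenerateText(prompt) {
  const response = await fetch('http://localhost:3000/api/generate/text', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt, options: { maxTokens: 500 } })
  });

  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }

  const { data } = await response.json();
  return data.text;
}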
Streaming Responses
// Streaming text generation
app.post('/api/generate/stream', llmLimiter, async (req, res) => {
  try {
    const { prompt, options = {} } = req.body;

    res.setHeader('Content-Type', 'text/plain');
    res.setHeader('Transfer-Encoding', 'chunked');

    const stream = await openai.chat.completions.create({
      model: options.model || 'gpt-3.5-turbo',
      messages: [{ role: 'user', content: prompt }],
      max_tokens: options.maxTokens || 1000,
      temperature: options.temperature || 0.7,
      stream: true
    });

    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content || '';
      if (content) {
        res.write(content);
      }
    }

    res.end();
  } catch (error) {
    console.error('Streaming error:', error);
    // Once chunks have been written, headers are already sent; we can only end the stream
    if (res.headersSent) {
      res.end();
    } else {
      res.status(500).json({ error: 'Streaming failed' });
    }
  }
});
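On the consuming side, the chunked response can be read incrementally. The following is a minimal sketch using the Fetch API's ReadableStream reader (available in Node 18+ and modern browsers); the URL and the onChunk callback are illustrative assumptions.

// Minimal streaming consumer sketch (URL is illustrative)
async function consumeStream(prompt, onChunk) {
  const response = await fetch('http://localhost:3000/api/generate/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt })
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder();

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // Handle each text chunk as it arrives (e.g. forward it to the UI)
    onChunk(decoder.decode(value, { stream: true }));
  }
}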
Caching and Optimization
LLM Response Caching
const Redis = require('redis');
const crypto = require('crypto');

class LLMCache {
  constructor() {
    // Assumes node-redis v4+, which returns promises and requires an explicit connect()
    this.redis = Redis.createClient({
      socket: {
        host: process.env.REDIS_HOST,
        port: Number(process.env.REDIS_PORT)
      }
    });
    this.redis.connect().catch((error) => {
      console.error('Redis connection error:', error);
    });
  }

  generateCacheKey(prompt, options) {
    const keyData = {
      prompt,
      model: options.model || 'gpt-3.5-turbo',
      temperature: options.temperature || 0.7,
      maxTokens: options.maxTokens || 1000
    };

    const hash = crypto
      .createHash('sha256')
      .update(JSON.stringify(keyData))
      .digest('hex');

    return `llm:${hash}`;
  }

  async get(prompt, options) {
    try {
      const key = this.generateCacheKey(prompt, options);
      const cached = await this.redis.get(key);
      return cached ? JSON.parse(cached) : null;
    } catch (error) {
      console.error('Cache get error:', error);
      return null;
    }
  }

  async set(prompt, options, result, ttl = 3600) {
    try {
      const key = this.generateCacheKey(prompt, options);
      await this.redis.setEx(key, ttl, JSON.stringify(result));
    } catch (error) {
      console.error('Cache set error:', error);
    }
  }
}

const llmCache = new LLMCache();

// Cached text generation
async function generateTextCached(prompt, options = {}) {
  // Check cache first
  const cached = await llmCache.get(prompt, options);
  if (cached) {
    console.log('Cache hit for prompt');
    return cached;
  }

  // Generate new content
  const result = await generateText(prompt, options);

  // Cache the result
  await llmCache.set(prompt, options, result, 1800); // 30 minutes

  return result;
}
Error Handling and Resilience
Retry Logic with Exponential Backoff
class LLMService {
  constructor() {
    this.maxRetries = 3;
    this.baseDelay = 1000; // 1 second
  }

  async generateWithRetry(prompt, options = {}) {
    let lastError;

    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      try {
        return await this.generateText(prompt, options);
      } catch (error) {
        lastError = error;

        // Don't retry on certain errors
        if (this.isNonRetryableError(error)) {
          throw error;
        }

        // Calculate delay with exponential backoff
        const delay = this.baseDelay * Math.pow(2, attempt - 1);
        console.warn(`Attempt ${attempt} failed, retrying in ${delay}ms:`, error.message);

        if (attempt < this.maxRetries) {
          await this.sleep(delay);
        }
      }
    }

    throw new Error(`Failed after ${this.maxRetries} attempts: ${lastError.message}`);
  }

  isNonRetryableError(error) {
    // Don't retry on authentication errors or invalid requests
    return error.status === 401 || error.status === 400;
  }

  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  async generateText(prompt, options) {
    // Your LLM generation logic here
    return await generateText(prompt, options);
  }
}
Production Considerations
Cost Management
class CostTracker {
  constructor() {
    this.dailyUsage = new Map();
    this.monthlyBudget = 1000; // $1000 monthly budget
  }

  calculateCost(model, usage) {
    const pricing = {
      'gpt-3.5-turbo': {
        input: 0.0015 / 1000, // $0.0015 per 1K tokens
        output: 0.002 / 1000  // $0.002 per 1K tokens
      },
      'gpt-4': {
        input: 0.03 / 1000, // $0.03 per 1K tokens
        output: 0.06 / 1000 // $0.06 per 1K tokens
      }
    };

    const modelPricing = pricing[model] || pricing['gpt-3.5-turbo'];
    const inputCost = (usage.prompt_tokens || 0) * modelPricing.input;
    const outputCost = (usage.completion_tokens || 0) * modelPricing.output;

    return inputCost + outputCost;
  }

  trackUsage(model, usage) {
    const cost = this.calculateCost(model, usage);
    const today = new Date().toISOString().split('T')[0];

    if (!this.dailyUsage.has(today)) {
      this.dailyUsage.set(today, 0);
    }

    const dailyTotal = this.dailyUsage.get(today) + cost;
    this.dailyUsage.set(today, dailyTotal);

    // Check budget
    if (dailyTotal > this.monthlyBudget / 30) {
      console.warn(`Daily budget exceeded: $${dailyTotal.toFixed(4)}`);
    }

    return cost;
  }

  getDailyUsage(date = new Date().toISOString().split('T')[0]) {
    return this.dailyUsage.get(date) || 0;
  }
}

const costTracker = new CostTracker();

// Track costs in your generation function
async function generateTextWithCostTracking(prompt, options = {}) {
  const result = await generateText(prompt, options);

  // Track cost
  const cost = costTracker.trackUsage(result.model, result.usage);
  console.log(`Generation cost: $${cost.toFixed(4)}`);

  return result;
}
Monitoring and Analytics
class LLMAnalytics {
  constructor() {
    this.metrics = {
      totalRequests: 0,
      failedRequests: 0,
      totalTokens: 0,
      totalResponseTime: 0,
      modelUsage: {}
    };
  }

  recordRequest(model, usage, responseTime, success = true) {
    this.metrics.totalRequests++;
    this.metrics.totalTokens += (usage.prompt_tokens || 0) + (usage.completion_tokens || 0);

    if (success) {
      this.metrics.totalResponseTime += responseTime;
    } else {
      this.metrics.failedRequests++;
    }

    this.metrics.modelUsage[model] = (this.metrics.modelUsage[model] || 0) + 1;
  }

  getMetrics() {
    const { totalRequests, failedRequests, totalTokens, totalResponseTime } = this.metrics;
    const successfulRequests = totalRequests - failedRequests;

    return {
      ...this.metrics,
      averageResponseTime: successfulRequests > 0 ? totalResponseTime / successfulRequests : 0,
      averageTokensPerRequest: totalRequests > 0 ? totalTokens / totalRequests : 0,
      errorRate: totalRequests > 0 ? failedRequests / totalRequests : 0,
      successRate: totalRequests > 0 ? successfulRequests / totalRequests : 1
    };
  }
}

const analytics = new LLMAnalytics();

// Analytics endpoint
app.get('/api/llm/analytics', (req, res) => {
  res.json(analytics.getMetrics());
});
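To feed the tracker, each generation call needs to be timed and recorded. The wrapper below is a hedged sketch of one way to do that around the generateText helper from earlier; the function name and the fallback model label on failure are assumptions.

// Sketch: wrap generation with timing so analytics can record each request
async function generateTextWithAnalytics(prompt, options = {}) {
  const start = Date.now();
  try {
    const result = await generateText(prompt, options);
    analytics.recordRequest(result.model, result.usage, Date.now() - start, true);
    return result;
  } catch (error) {
    // No usage data is available on failure, so record an empty usage object
    analytics.recordRequest(options.model || 'gpt-3.5-turbo', {}, Date.now() - start, false);
    throw error;
  }
}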
Security Considerations
Input Sanitization
class LLMSecurity {
  static sanitizeInput(input) {
    // Remove potentially harmful content
    return input
      .replace(/<script[^>]*>.*?<\/script>/gi, '') // Remove scripts
      .replace(/<[^>]*>/g, '') // Remove HTML tags
      .trim()
      .substring(0, 10000); // Limit length
  }

  static validatePrompt(prompt) {
    const issues = [];

    if (!prompt || prompt.length < 3) {
      issues.push('Prompt too short');
    }

    if (prompt.length > 10000) {
      issues.push('Prompt too long');
    }

    // Check for potential injection attempts
    const suspiciousPatterns = [
      /ignore previous instructions/i,
      /forget everything/i,
      /you are now/i
    ];

    suspiciousPatterns.forEach(pattern => {
      if (pattern.test(prompt)) {
        issues.push('Suspicious prompt pattern detected');
      }
    });

    return {
      isValid: issues.length === 0,
      issues
    };
  }
}

// Secure text generation
async function generateTextSecure(prompt, options = {}) {
  // Sanitize input
  const sanitizedPrompt = LLMSecurity.sanitizeInput(prompt);

  // Validate prompt
  const validation = LLMSecurity.validatePrompt(sanitizedPrompt);
  if (!validation.isValid) {
    throw new Error(`Invalid prompt: ${validation.issues.join(', ')}`);
  }

  return await generateText(sanitizedPrompt, options);
}
Conclusion
Integrating LLMs into backend applications requires careful consideration of:
- API design: create clean, consistent interfaces
- Prompt engineering: use templates and few-shot learning
- Caching: implement intelligent caching strategies
- Error handling: build resilient systems with retry logic
- Cost management: monitor and control API usage costs
- Security: sanitize inputs and validate prompts
- Monitoring: track performance and usage metrics
Start with simple integrations and gradually add complexity as you learn the patterns that work best for your specific use case. Remember to always test thoroughly and monitor your LLM usage to ensure optimal performance and cost-effectiveness.