Backend Performance Monitoring & Optimization
Performance monitoring is crucial for maintaining scalable, responsive backend applications. This guide covers essential monitoring techniques, tools, and optimization strategies.
Performance Metrics to Track
Key Performance Indicators (KPIs)
Node.js:

```javascript
// Core performance metrics
const performanceMetrics = {
  // Response time metrics
  responseTime: {
    p50: 0,  // 50th percentile
    p95: 0,  // 95th percentile
    p99: 0,  // 99th percentile
    max: 0   // Maximum response time
  },

  // Throughput metrics
  throughput: {
    requestsPerSecond: 0,
    requestsPerMinute: 0,
    concurrentUsers: 0
  },

  // Error metrics
  errors: {
    errorRate: 0,   // Percentage of failed requests
    errorCount: 0,  // Total error count
    errorTypes: {}  // Breakdown by error type
  },

  // Resource utilization
  resources: {
    cpuUsage: 0,     // CPU utilization percentage
    memoryUsage: 0,  // Memory usage in MB
    diskUsage: 0,    // Disk usage percentage
    networkIO: 0     // Network I/O in bytes
  }
};
```
Java/Spring Boot:
```java
@Component
public class PerformanceMetrics {

    private final MeterRegistry meterRegistry;

    public PerformanceMetrics(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
    }

    // Core performance metrics using Micrometer
    public void recordResponseTime(String operation, long durationMs) {
        Timer.builder("operation.duration")
            .tag("operation", operation)
            .register(meterRegistry)
            .record(durationMs, TimeUnit.MILLISECONDS);
    }

    public void recordRequestCount(String endpoint, String method, int statusCode) {
        Counter.builder("http.requests")
            .tag("endpoint", endpoint)
            .tag("method", method)
            .tag("status", String.valueOf(statusCode))
            .register(meterRegistry)
            .increment();
    }

    public void recordErrorCount(String operation, String errorType) {
        Counter.builder("errors.total")
            .tag("operation", operation)
            .tag("type", errorType)
            .register(meterRegistry)
            .increment();
    }

    // Register heap gauges once; Micrometer polls them on every scrape
    @PostConstruct
    public void registerMemoryGauges() {
        MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean();

        Gauge.builder("jvm.memory.used", memoryBean,
                bean -> bean.getHeapMemoryUsage().getUsed())
            .register(meterRegistry);

        Gauge.builder("jvm.memory.max", memoryBean,
                bean -> bean.getHeapMemoryUsage().getMax())
            .register(meterRegistry);
    }

    // Database query performance
    public void recordDatabaseQuery(String query, long durationMs) {
        Timer.builder("database.query.duration")
            .tag("query", query)
            .register(meterRegistry)
            .record(durationMs, TimeUnit.MILLISECONDS);
    }
}

// Performance monitoring aspect
@Aspect
@Component
public class PerformanceMonitoringAspect {

    private final PerformanceMetrics performanceMetrics;
    private final MeterRegistry meterRegistry;

    public PerformanceMonitoringAspect(PerformanceMetrics performanceMetrics,
                                       MeterRegistry meterRegistry) {
        this.performanceMetrics = performanceMetrics;
        this.meterRegistry = meterRegistry;
    }

    @Around("@annotation(Monitored)")
    public Object monitorMethod(ProceedingJoinPoint joinPoint) throws Throwable {
        String methodName = joinPoint.getSignature().getName();
        Timer.Sample sample = Timer.start(meterRegistry);

        try {
            return joinPoint.proceed();
        } catch (Exception e) {
            performanceMetrics.recordErrorCount(methodName, e.getClass().getSimpleName());
            throw e;
        } finally {
            sample.stop(Timer.builder("method.duration")
                .tag("method", methodName)
                .register(meterRegistry));
        }
    }
}

// Custom annotation for monitoring
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface Monitored {
    String value() default "";
}
```
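With the aspect in place, annotating a Spring bean method is all it takes to get timing and error counters. A minimal usage sketch follows; `OrderService`, `OrderRepository`, and `Order` are hypothetical names used only for illustration.

```java
@Service
public class OrderService {

    private final OrderRepository orderRepository; // hypothetical repository

    public OrderService(OrderRepository orderRepository) {
        this.orderRepository = orderRepository;
    }

    // PerformanceMonitoringAspect wraps this call, recording "method.duration"
    // and incrementing "errors.total" if the call throws.
    @Monitored("findOrdersForCustomer")
    public List<Order> findOrdersForCustomer(Long customerId) {
        return orderRepository.findByCustomerId(customerId);
    }
}
```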
Custom Performance Monitoring
```javascript
class PerformanceMonitor {
  constructor() {
    this.metrics = new Map();
    this.startTimes = new Map();
  }

  // Start timing an operation
  startTimer(operationId) {
    this.startTimes.set(operationId, process.hrtime.bigint());
  }

  // End timing and record metric
  endTimer(operationId, tags = {}) {
    const startTime = this.startTimes.get(operationId);
    if (!startTime) return;

    const endTime = process.hrtime.bigint();
    const duration = Number(endTime - startTime) / 1000000; // Convert to milliseconds

    this.recordMetric('operation_duration', duration, {
      operation: operationId,
      ...tags
    });

    this.startTimes.delete(operationId);
  }

  // Record a metric value
  recordMetric(name, value, tags = {}) {
    const key = `${name}_${JSON.stringify(tags)}`;

    if (!this.metrics.has(key)) {
      this.metrics.set(key, {
        name,
        tags,
        values: [],
        count: 0,
        sum: 0,
        min: Infinity,
        max: -Infinity
      });
    }

    const metric = this.metrics.get(key);
    metric.values.push(value);
    metric.count++;
    metric.sum += value;
    metric.min = Math.min(metric.min, value);
    metric.max = Math.max(metric.max, value);

    // Keep only last 1000 values to prevent memory leaks
    if (metric.values.length > 1000) {
      metric.values = metric.values.slice(-1000);
    }
  }

  // Get percentile value
  getPercentile(values, percentile) {
    const sorted = [...values].sort((a, b) => a - b);
    const index = Math.ceil((percentile / 100) * sorted.length) - 1;
    return sorted[index] || 0;
  }

  // Get metrics summary
  getMetricsSummary() {
    const summary = {};

    for (const [key, metric] of this.metrics) {
      if (metric.count === 0) continue;

      summary[metric.name] = {
        count: metric.count,
        sum: metric.sum,
        avg: metric.sum / metric.count,
        min: metric.min,
        max: metric.max,
        p50: this.getPercentile(metric.values, 50),
        p95: this.getPercentile(metric.values, 95),
        p99: this.getPercentile(metric.values, 99),
        tags: metric.tags
      };
    }

    return summary;
  }
}

// Global performance monitor instance
const perfMonitor = new PerformanceMonitor();
```
Application Performance Monitoring (APM)
Express.js Middleware for Request Tracking
```javascript
const express = require('express');
const app = express();

// Request timing middleware
app.use((req, res, next) => {
  const requestId = `${req.method}_${req.path}_${Date.now()}`;
  req.requestId = requestId;

  perfMonitor.startTimer(requestId);

  // Override res.end to capture response time
  const originalEnd = res.end;
  res.end = function (...args) {
    perfMonitor.endTimer(requestId, {
      method: req.method,
      path: req.path,
      statusCode: res.statusCode
    });
    originalEnd.apply(this, args);
  };

  next();
});

// Database query monitoring: wraps a query promise and records its duration
function monitorDatabaseQuery(queryPromise, sql = 'unknown') {
  const queryId = `db_query_${Date.now()}`;
  perfMonitor.startTimer(queryId);

  return queryPromise
    .then(result => {
      perfMonitor.endTimer(queryId, { type: 'database', query: sql });
      return result;
    })
    .catch(error => {
      perfMonitor.endTimer(queryId, { type: 'database', query: sql, error: true });
      throw error;
    });
}

// Usage example
app.get('/api/users', async (req, res) => {
  try {
    const result = await monitorDatabaseQuery(
      db.query('SELECT * FROM users'),
      'SELECT * FROM users'
    );
    res.json(result.rows);
  } catch (error) {
    res.status(500).json({ error: 'Internal server error' });
  }
});
```
Memory and CPU Monitoring
```javascript
const os = require('os');
const process = require('process');

class SystemMonitor {
  constructor() {
    this.startTime = Date.now();
    this.startCpuUsage = process.cpuUsage();
  }

  getSystemMetrics() {
    const memUsage = process.memoryUsage();
    const cpuUsage = process.cpuUsage(this.startCpuUsage);

    return {
      // Memory metrics
      memory: {
        rss: Math.round(memUsage.rss / 1024 / 1024), // MB
        heapTotal: Math.round(memUsage.heapTotal / 1024 / 1024),
        heapUsed: Math.round(memUsage.heapUsed / 1024 / 1024),
        external: Math.round(memUsage.external / 1024 / 1024),
        systemTotal: Math.round(os.totalmem() / 1024 / 1024),
        systemFree: Math.round(os.freemem() / 1024 / 1024)
      },

      // CPU metrics
      cpu: {
        user: cpuUsage.user / 1000000, // Convert to seconds
        system: cpuUsage.system / 1000000,
        cores: os.cpus().length,
        loadAverage: os.loadavg()
      },

      // Process metrics
      process: {
        uptime: Math.round((Date.now() - this.startTime) / 1000),
        pid: process.pid,
        version: process.version,
        platform: process.platform
      }
    };
  }

  // Check if system is under stress
  isSystemStressed() {
    const metrics = this.getSystemMetrics();
    const loadAvg = metrics.cpu.loadAverage[0];
    const memoryUsagePercent = (metrics.memory.heapUsed / metrics.memory.heapTotal) * 100;

    return {
      highLoad: loadAvg > os.cpus().length * 0.8,
      highMemory: memoryUsagePercent > 80,
      lowMemory: metrics.memory.systemFree < 100 // Less than 100 MB free
    };
  }
}

const systemMonitor = new SystemMonitor();

// Periodic system monitoring: log metrics every 30 seconds
setInterval(() => {
  const metrics = systemMonitor.getSystemMetrics();
  const stress = systemMonitor.isSystemStressed();

  console.log('System Metrics:', {
    memory: metrics.memory,
    cpu: metrics.cpu,
    stress
  });

  // Alert on high stress
  if (stress.highLoad || stress.highMemory || stress.lowMemory) {
    console.warn('System under stress:', stress);
  }
}, 30000);
```
Database Performance Monitoring
Query Performance Tracking
```javascript
const { Pool } = require('pg');

class MonitoredPool extends Pool {
  constructor(config) {
    super(config);
    this.queryCount = 0;
    this.slowQueries = [];
    this.queryTimes = [];
  }

  async query(text, params) {
    const startTime = process.hrtime.bigint();
    this.queryCount++;

    try {
      const result = await super.query(text, params);
      const endTime = process.hrtime.bigint();
      const duration = Number(endTime - startTime) / 1000000; // Convert to ms

      // Record query performance (keep only the last 1000 samples)
      this.queryTimes.push(duration);
      if (this.queryTimes.length > 1000) {
        this.queryTimes = this.queryTimes.slice(-1000);
      }

      // Track slow queries (slower than 1 second)
      if (duration > 1000) {
        this.slowQueries.push({
          query: text,
          params,
          duration,
          timestamp: new Date()
        });

        console.warn(`Slow query detected: ${duration}ms`, {
          query: text.substring(0, 100) + '...',
          params
        });
      }

      return result;
    } catch (error) {
      const endTime = process.hrtime.bigint();
      const duration = Number(endTime - startTime) / 1000000;

      console.error('Database query error:', {
        query: text,
        params,
        duration,
        error: error.message
      });

      throw error;
    }
  }

  getQueryStats() {
    const avgTime = this.queryTimes.length > 0
      ? this.queryTimes.reduce((a, b) => a + b, 0) / this.queryTimes.length
      : 0;

    return {
      totalQueries: this.queryCount,
      averageQueryTime: Math.round(avgTime),
      slowQueries: this.slowQueries.length,
      recentSlowQueries: this.slowQueries.slice(-10)
    };
  }
}

// Use monitored pool
const pool = new MonitoredPool({
  connectionString: process.env.DATABASE_URL
});

// Log query stats every minute
setInterval(() => {
  const stats = pool.getQueryStats();
  console.log('Database Performance:', stats);
}, 60000);
```
Real-time Monitoring Dashboard
Express.js Health Check Endpoint
```javascript
app.get('/health', (req, res) => {
  const systemMetrics = systemMonitor.getSystemMetrics();
  const performanceMetrics = perfMonitor.getMetricsSummary();
  const dbStats = pool.getQueryStats();
  const stress = systemMonitor.isSystemStressed();

  const health = {
    status: stress.highLoad || stress.highMemory ? 'degraded' : 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    system: systemMetrics,
    performance: performanceMetrics,
    database: dbStats,
    alerts: {
      highLoad: stress.highLoad,
      highMemory: stress.highMemory,
      lowMemory: stress.lowMemory
    }
  };

  const statusCode = health.status === 'healthy' ? 200 : 503;
  res.status(statusCode).json(health);
});

// Detailed metrics endpoint
app.get('/metrics', (req, res) => {
  const metrics = {
    performance: perfMonitor.getMetricsSummary(),
    system: systemMonitor.getSystemMetrics(),
    database: pool.getQueryStats()
  };

  res.json(metrics);
});
```
Performance Optimization Strategies
Caching Implementation
```javascript
const NodeCache = require('node-cache');
const { createClient } = require('redis');

class CacheManager {
  constructor() {
    this.memoryCache = new NodeCache({ stdTTL: 600 }); // 10 minutes

    // node-redis v4 client: connect once, log connection errors
    this.redis = createClient({
      socket: {
        host: process.env.REDIS_HOST,
        port: Number(process.env.REDIS_PORT)
      }
    });
    this.redis.on('error', (err) => console.error('Redis client error:', err));
    this.redis.connect().catch((err) => console.error('Redis connect error:', err));
  }

  async get(key) {
    // Try memory cache first
    const value = this.memoryCache.get(key);
    if (value !== undefined) {
      perfMonitor.recordMetric('cache_hit', 1, { type: 'memory' });
      return value;
    }

    // Try Redis cache
    try {
      const redisValue = await this.redis.get(key);
      if (redisValue) {
        const parsed = JSON.parse(redisValue);
        this.memoryCache.set(key, parsed); // Populate memory cache
        perfMonitor.recordMetric('cache_hit', 1, { type: 'redis' });
        return parsed;
      }
    } catch (error) {
      console.error('Redis cache error:', error);
    }

    perfMonitor.recordMetric('cache_miss', 1, { type: 'all' });
    return null;
  }

  async set(key, value, ttl = 3600) {
    // Set in memory cache
    this.memoryCache.set(key, value);

    // Set in Redis cache with an expiry
    try {
      await this.redis.set(key, JSON.stringify(value), { EX: ttl });
    } catch (error) {
      console.error('Redis cache set error:', error);
    }
  }

  async del(key) {
    this.memoryCache.del(key);
    try {
      await this.redis.del(key);
    } catch (error) {
      console.error('Redis cache delete error:', error);
    }
  }
}

const cache = new CacheManager();

// Cached database query
async function getCachedUser(userId) {
  const cacheKey = `user:${userId}`;

  let user = await cache.get(cacheKey);
  if (user) {
    return user;
  }

  // Fetch from database
  const result = await pool.query('SELECT * FROM users WHERE id = $1', [userId]);
  user = result.rows[0];

  if (user) {
    await cache.set(cacheKey, user, 1800); // 30 minutes
  }

  return user;
}
```
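The other half of caching is invalidation: when a user row changes, the cached copy should be dropped so the next read repopulates both layers. Here is a minimal sketch using the `CacheManager` above; the `PUT /api/users/:id` route and its UPDATE statement are illustrative, and it assumes `app.use(express.json())` is registered for `req.body`.

```javascript
// Invalidate the cached user whenever the row is updated
app.put('/api/users/:id', async (req, res) => {
  const userId = req.params.id;

  try {
    const result = await pool.query(
      'UPDATE users SET name = $1 WHERE id = $2 RETURNING *',
      [req.body.name, userId]
    );

    // Drop the stale entry from both the memory and Redis layers
    await cache.del(`user:${userId}`);

    res.json(result.rows[0]);
  } catch (error) {
    res.status(500).json({ error: 'Internal server error' });
  }
});
```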
Connection Pooling Optimization
```javascript
// Optimized database connection pool
const optimizedPool = new Pool({
  connectionString: process.env.DATABASE_URL,
  max: 20,                        // Maximum number of clients in the pool
  min: 5,                         // Minimum number of clients kept in the pool
  idleTimeoutMillis: 30000,       // Close idle clients after 30 seconds
  connectionTimeoutMillis: 2000,  // Fail if a connection cannot be acquired within 2 seconds

  // SSL configuration for production
  ssl: process.env.NODE_ENV === 'production'
    ? { rejectUnauthorized: false }
    : false
});

// Pool monitoring
setInterval(() => {
  const poolStats = {
    totalCount: optimizedPool.totalCount,     // All clients currently in the pool
    idleCount: optimizedPool.idleCount,       // Clients not checked out
    waitingCount: optimizedPool.waitingCount  // Requests queued for a client
  };

  console.log('Pool Stats:', poolStats);

  // Alert if requests are queuing up for connections
  if (poolStats.waitingCount > 5) {
    console.warn('High connection pool wait time!');
  }
}, 30000);
```
Alerting and Notifications
Performance Alert System
```javascript
class AlertManager {
  constructor() {
    this.alerts = new Map();
    this.thresholds = {
      responseTime: 2000, // 2 seconds
      errorRate: 0.05,    // 5%
      memoryUsage: 0.8,   // 80%
      cpuUsage: 0.8       // 80%
    };
  }

  checkAlerts(metrics) {
    const alerts = [];

    // Response time alert
    if (metrics.performance?.operation_duration?.p95 > this.thresholds.responseTime) {
      alerts.push({
        type: 'HIGH_RESPONSE_TIME',
        message: `95th percentile response time is ${metrics.performance.operation_duration.p95}ms`,
        severity: 'warning'
      });
    }

    // Error rate alert
    const errorRate = metrics.errors?.errorRate || 0;
    if (errorRate > this.thresholds.errorRate) {
      alerts.push({
        type: 'HIGH_ERROR_RATE',
        message: `Error rate is ${(errorRate * 100).toFixed(2)}%`,
        severity: 'critical'
      });
    }

    // Memory usage alert
    const memoryUsage = metrics.system?.memory?.heapUsed / metrics.system?.memory?.heapTotal || 0;
    if (memoryUsage > this.thresholds.memoryUsage) {
      alerts.push({
        type: 'HIGH_MEMORY_USAGE',
        message: `Memory usage is ${(memoryUsage * 100).toFixed(2)}%`,
        severity: 'warning'
      });
    }

    return alerts;
  }

  async sendAlert(alert) {
    // Send to a monitoring service (e.g., Datadog, New Relic, PagerDuty)
    console.warn('ALERT:', alert);

    // You can integrate with external services here
    // await sendToSlack(alert);
    // await sendToEmail(alert);
  }
}

const alertManager = new AlertManager();

// Check alerts every 30 seconds
setInterval(() => {
  const metrics = {
    performance: perfMonitor.getMetricsSummary(),
    system: systemMonitor.getSystemMetrics(),
    errors: { errorRate: 0 } // Calculate from your error tracking
  };

  const alerts = alertManager.checkAlerts(metrics);
  alerts.forEach(alert => alertManager.sendAlert(alert));
}, 30000);
```
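The `errorRate` field above is left as a placeholder ("calculate from your error tracking"). One way to derive it with the `PerformanceMonitor` from earlier is to record one counter per request and one per failed request, then divide the two counts. This helper is an illustration, not part of the monitor's API; the `http_requests` and `http_errors` metric names are assumptions, and it relies on those counters being recorded without tags so each name maps to a single series in `getMetricsSummary()`.

```javascript
// Derive an error rate from counters recorded via perfMonitor.recordMetric().
// Assumes the request middleware calls recordMetric('http_requests', 1)
// for every request and recordMetric('http_errors', 1) for every 5xx response.
function calculateErrorRate() {
  const summary = perfMonitor.getMetricsSummary();
  const totalRequests = summary.http_requests?.count || 0;
  const totalErrors = summary.http_errors?.count || 0;

  return totalRequests > 0 ? totalErrors / totalRequests : 0;
}
```

The alerting interval in the previous block could then pass `errors: { errorRate: calculateErrorRate() }` instead of the hard-coded zero.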
Conclusion
Effective performance monitoring requires:
- Comprehensive metrics collection - Track all relevant KPIs
- Real-time monitoring - Detect issues as they happen
- Proactive alerting - Get notified before problems escalate
- Performance optimization - Continuously improve based on data
- Regular analysis - Review trends and patterns
Remember: Monitoring without action is just data collection. Use the insights to continuously improve your application's performance and reliability.