load-testing.md 23 KB

Load Testing

Comprehensive guide to load testing tools, methodology, and CI integration.

k6 (Grafana)

Script Structure

// k6 script: load-test.js
import http from 'k6/http';
import { check, sleep, group } from 'k6';
import { Rate, Trend, Counter } from 'k6/metrics';

// Custom metrics
const errorRate = new Rate('errors');
const responseTime = new Trend('response_time');
const requestCount = new Counter('total_requests');

// Test configuration
export const options = {
  // Scenario-based configuration
  scenarios: {
    // Ramp up and sustain load
    load_test: {
      executor: 'ramping-vus',
      startVUs: 0,
      stages: [
        { duration: '2m', target: 50 },   // Ramp up
        { duration: '5m', target: 50 },   // Sustain
        { duration: '2m', target: 0 },    // Ramp down
      ],
      gracefulRampDown: '30s',
    },
  },

  // Thresholds (pass/fail criteria)
  thresholds: {
    http_req_duration: ['p(95)<500', 'p(99)<1000'],  // ms
    http_req_failed: ['rate<0.01'],                   // <1% error rate
    errors: ['rate<0.05'],                            // Custom metric
  },
};

// Setup: runs once before the test; its return value is passed to every
// VU iteration and to teardown().
export function setup() {
  const loginRes = http.post('https://api.example.com/login', {
    username: 'testuser',
    password: 'testpass',
  });

  // Fail fast: aborting here stops the whole run instead of hammering
  // the API with thousands of unauthenticated iterations.
  if (loginRes.status !== 200) {
    throw new Error(`login failed: status ${loginRes.status}`);
  }

  const token = loginRes.json('token');
  if (!token) {
    throw new Error('login response did not contain a token');
  }

  return { token };
}

// Default function: runs once per VU iteration.
// `data` is the object returned by setup() (read it, don't mutate it —
// each VU gets its own copy).
export default function (data) {
  group('API endpoints', function () {
    const authHeaders = { Authorization: `Bearer ${data.token}` };

    // GET request
    const listRes = http.get('https://api.example.com/items', {
      headers: authHeaders,
    });

    check(listRes, {
      'status is 200': (r) => r.status === 200,
      'response time < 500ms': (r) => r.timings.duration < 500,
      // Guard the body parse: under overload the server may return a
      // non-JSON error page, and an exception thrown inside a check
      // callback aborts the iteration before the metrics below record.
      'has items': (r) => {
        try {
          const items = r.json('items');
          return Array.isArray(items) && items.length > 0;
        } catch (e) {
          return false;
        }
      },
    });

    errorRate.add(listRes.status !== 200);
    responseTime.add(listRes.timings.duration);
    requestCount.add(1);

    // POST request
    const createRes = http.post(
      'https://api.example.com/items',
      JSON.stringify({ name: 'test item', value: Math.random() }),
      {
        headers: {
          'Content-Type': 'application/json',
          ...authHeaders,
        },
      }
    );

    check(createRes, {
      'created successfully': (r) => r.status === 201,
    });

    // Record the POST too, so the `errors` and `total_requests` custom
    // metrics reflect every request this iteration made, not just the GET.
    errorRate.add(createRes.status !== 201);
    responseTime.add(createRes.timings.duration);
    requestCount.add(1);

    sleep(1); // Think time between requests
  });
}

// Teardown: runs once after the whole test has finished; `data` is the
// object setup() returned, so the auth token is still available here.
export function teardown(data) {
  const params = {
    headers: { Authorization: `Bearer ${data.token}` },
  };
  http.post('https://api.example.com/cleanup', null, params);
}

k6 Executors

Executor selection:
│
├─ shared-iterations
│  └─ Fixed total iterations split across VUs
│     Use: "Run exactly N requests total"
│
├─ per-vu-iterations
│  └─ Each VU runs exactly N iterations
│     Use: "Each user does N actions"
│
├─ constant-vus
│  └─ Fixed number of VUs for a duration
│     Use: "Sustain N concurrent users"
│
├─ ramping-vus
│  └─ VUs ramp up/down in stages
│     Use: Standard load test pattern
│
├─ constant-arrival-rate
│  └─ Fixed request rate regardless of response time
│     Use: "Maintain exactly N RPS" (most realistic)
│
├─ ramping-arrival-rate
│  └─ Request rate ramps up/down
│     Use: "Find breaking point at increasing RPS"
│
└─ externally-controlled
   └─ VUs controlled via k6 REST API
      Use: Dynamic load adjustment during test

k6 CLI Commands

# Run a test
k6 run script.js

# Run with overrides
k6 run --vus 50 --duration 30s script.js
k6 run --env BASE_URL=https://staging.example.com script.js

# Output to various formats
k6 run --out json=results.json script.js
k6 run --out csv=results.csv script.js
k6 run --out influxdb=http://localhost:8086/k6 script.js

# Cloud execution (requires k6 cloud account)
k6 cloud script.js

# Convert HAR to k6 script (k6 convert is deprecated in recent releases;
# prefer the standalone har-to-k6 tool)
k6 convert recording.har -O generated-script.js
# or: har-to-k6 recording.har -o generated-script.js

# Inspect script options without running
k6 inspect script.js

k6 Browser Testing

import { browser } from 'k6/browser';

export const options = {
  scenarios: {
    browser: {
      executor: 'constant-vus',
      vus: 1,          // browser VUs are heavyweight — keep counts low
      duration: '30s',
      options: {
        // Required for browser scenarios: tells k6 which browser to drive.
        browser: {
          type: 'chromium',
        },
      },
    },
  },
};

// Browser iteration: log in through the UI, then measure LCP.
export default async function () {
  const page = await browser.newPage();
  try {
    await page.goto('https://example.com');
    await page.locator('input[name="username"]').fill('testuser');
    await page.locator('input[name="password"]').fill('testpass');

    // Start waiting for the navigation BEFORE the click resolves:
    // calling waitForNavigation() after click() can miss a fast
    // navigation entirely and hang until its timeout.
    await Promise.all([
      page.waitForNavigation(),
      page.locator('button[type="submit"]').click(),
    ]);

    // Measure Web Vitals: resolve with the latest LCP entry observed
    // (buffered: true replays entries recorded before the observer ran).
    const lcp = await page.evaluate(() => {
      return new Promise((resolve) => {
        new PerformanceObserver((list) => {
          const entries = list.getEntries();
          resolve(entries[entries.length - 1].startTime);
        }).observe({ type: 'largest-contentful-paint', buffered: true });
      });
    });
    console.log(`LCP: ${lcp}ms`);
  } finally {
    await page.close(); // always release the page, even on failure
  }
}

Artillery

YAML Configuration

# artillery-config.yml
config:
  target: "https://api.example.com"
  phases:
    - duration: 120    # 2 minutes
      arrivalRate: 10  # 10 new users per second
      name: "Warm-up"
    - duration: 300    # 5 minutes
      arrivalRate: 50  # 50 new users per second
      name: "Sustained load"
    - duration: 60
      arrivalRate: 100
      name: "Spike"

  # Plugins
  plugins:
    expect: {}        # Response validation
    metrics-by-endpoint: {} # Per-endpoint metrics

  # Default headers
  defaults:
    headers:
      Content-Type: "application/json"

  # Variables
  variables:
    baseUrl: "https://api.example.com"

  # Connection settings
  http:
    timeout: 10        # seconds
    pool: 100          # connection pool size

scenarios:
  - name: "Browse and purchase"
    weight: 70         # 70% of traffic
    flow:
      - get:
          url: "/products"
          expect:
            - statusCode: 200
            - hasProperty: "items"
          capture:
            - json: "$.items[0].id"
              as: "productId"
      - think: 3       # 3 second pause
      - get:
          url: "/products/{{ productId }}"
          expect:
            - statusCode: 200
      - post:
          url: "/cart"
          json:
            productId: "{{ productId }}"
            quantity: 1
          expect:
            - statusCode: 201

  - name: "Search"
    weight: 30         # 30% of traffic
    flow:
      - get:
          url: "/search?q={{ $randomString() }}"
          expect:
            - statusCode: 200

Artillery CLI

# Run load test
artillery run artillery-config.yml

# Quick test (no config needed)
artillery quick --count 100 --num 10 https://api.example.com

# Generate HTML report
artillery run --output report.json artillery-config.yml
artillery report report.json

# Run with environment-specific config
artillery run -e staging artillery-config.yml

# Run with Playwright (browser scenarios)
artillery run --platform playwright artillery-browser.yml

vegeta (Go)

Attack and Report

# Basic attack
echo "GET http://localhost:8080/" | vegeta attack -duration=30s -rate=50/s | vegeta report

# Multiple endpoints from file
# targets.txt:
# GET http://localhost:8080/api/users
# GET http://localhost:8080/api/products
# POST http://localhost:8080/api/orders
# Content-Type: application/json
# @body.json
vegeta attack -targets=targets.txt -duration=60s -rate=100/s | vegeta report

# Custom headers
echo "GET http://localhost:8080/api/data" | \
  vegeta attack -header "Authorization: Bearer TOKEN" -duration=30s | \
  vegeta report

# Output formats
echo "GET http://localhost:8080/" | vegeta attack -duration=30s | vegeta report -type=text
echo "GET http://localhost:8080/" | vegeta attack -duration=30s | vegeta report -type=json
echo "GET http://localhost:8080/" | vegeta attack -duration=30s | vegeta report -type='hist[0,50ms,100ms,200ms,500ms,1s]'
# (quote the hist buckets: [ and ] are shell glob characters)

# Generate latency plot (HDR histogram)
echo "GET http://localhost:8080/" | vegeta attack -duration=60s | vegeta plot > plot.html

# Encode results for later analysis
echo "GET http://localhost:8080/" | vegeta attack -duration=60s | vegeta encode > results.json

# Constant rate vs max rate
echo "GET http://localhost:8080/" | vegeta attack -rate=0 -max-workers=100 -duration=30s | vegeta report
# -rate=0 means "as fast as possible" with max-workers limit

vegeta Report Interpretation

Requests      [total, rate, throughput]  3000, 100.03, 99.87
Duration      [total, attack, wait]     30.04s, 29.99s, 49.54ms
Latencies     [min, mean, 50, 90, 95, 99, max]  12.5ms, 48.2ms, 42.1ms, 85.3ms, 120.5ms, 250.1ms, 1.2s
Bytes In      [total, mean]             1500000, 500.00
Bytes Out     [total, mean]             0, 0.00
Success       [ratio]                   99.5%
Status Codes  [code:count]              200:2985  500:15

Key metrics:
- p50 (median): typical user experience
- p95: 95% of users experience this or better
- p99: tail latency (worst 1%)
- Success ratio: anything below 99% needs investigation
- Throughput vs rate: throughput < rate means server can't keep up

wrk / wrk2

wrk: Lightweight HTTP Benchmarking

# Basic usage
wrk -t4 -c100 -d30s http://localhost:8080/
# -t4: 4 threads
# -c100: 100 connections
# -d30s: 30 second duration

# With Lua script
wrk -t4 -c100 -d30s -s script.lua http://localhost:8080/

# wrk2 (constant throughput mode)
wrk2 -t4 -c100 -d30s -R2000 http://localhost:8080/
# -R2000: target 2000 requests/second

wrk Lua Scripts

-- post-request.lua: POST with JSON body
-- Fields set on the global `wrk` table apply to every request wrk sends.
wrk.method = "POST"
wrk.body   = '{"username":"test","password":"test"}'
wrk.headers["Content-Type"] = "application/json"

-- dynamic-request.lua: different paths per request
-- NOTE(review): each wrk thread runs its own Lua VM, so `counter` is
-- per-thread state — with -tN every thread walks its own 0..999 cycle.
counter = 0
request = function()
  counter = counter + 1
  local path = "/api/items/" .. (counter % 1000)
  return wrk.format("GET", path)
end

-- response.lua: validate responses
-- Stops the issuing thread on the first non-200 response.
response = function(status, headers, body)
  if status ~= 200 then
    wrk.thread:stop()
  end
end

-- report.lua: custom reporting, called once when the run completes
done = function(summary, latency, requests)
  io.write("Latency distribution:\n")
  for _, p in pairs({ 50, 90, 95, 99, 99.9 }) do
    -- percentile() yields microseconds; /1000 converts to ms for printing
    n = latency:percentile(p)
    io.write(string.format("%g%%\t%d ms\n", p, n / 1000))
  end
end

Locust (Python)

User Classes and Tasks

# locustfile.py
from locust import HttpUser, task, between, events
from locust import LoadTestShape
import json

class WebsiteUser(HttpUser):
    """Simulates a regular visitor: browse, create, and search tasks."""

    # Wait between requests (simulates think time)
    wait_time = between(1, 5)

    # Run once per simulated user on start
    def on_start(self):
        response = self.client.post("/login", json={
            "username": "testuser",
            "password": "testpass"
        })
        # Fail loudly if login breaks: otherwise response.json()["token"]
        # raises an opaque KeyError, or every later request 401s and
        # pollutes the results.
        if response.status_code != 200:
            raise RuntimeError(
                f"login failed with status {response.status_code}"
            )
        self.token = response.json()["token"]
        self.client.headers.update({
            "Authorization": f"Bearer {self.token}"
        })

    @task(3)  # Weight: 3x more likely than weight-1 tasks
    def browse_items(self):
        # catch_response lets us mark a 200-with-empty-body as a failure.
        with self.client.get("/api/items", catch_response=True) as response:
            if response.status_code == 200:
                items = response.json()["items"]
                if len(items) == 0:
                    response.failure("No items returned")
            else:
                response.failure(f"Status {response.status_code}")

    @task(1)
    def create_item(self):
        self.client.post("/api/items", json={
            "name": f"item-{self.environment.runner.user_count}",
            "value": 42
        })

    @task(2)
    def search(self):
        self.client.get("/api/search?q=test")

    # Run once per user when it stops
    def on_stop(self):
        self.client.post("/logout")


class AdminUser(HttpUser):
    """Separate user class with different behavior"""
    # Admins act much less frequently than regular visitors.
    wait_time = between(5, 15)
    weight = 1  # 1 admin for every 10 regular users (if WebsiteUser weight=10)

    @task
    def check_dashboard(self):
        # Single task: periodically poll the admin dashboard.
        self.client.get("/admin/dashboard")


# Custom load shape
class StagesShape(LoadTestShape):
    """Ramp up, sustain, spike, recover.

    Each entry's ``duration`` is cumulative: the number of seconds since
    test start at which that stage ends.
    """

    stages = [
        {"duration": 60,  "users": 10,  "spawn_rate": 2},
        {"duration": 300, "users": 50,  "spawn_rate": 5},
        {"duration": 360, "users": 200, "spawn_rate": 50},  # Spike
        {"duration": 420, "users": 50,  "spawn_rate": 10},  # Recover
        {"duration": 480, "users": 0,   "spawn_rate": 10},  # Ramp down
    ]

    def tick(self):
        elapsed = self.get_run_time()
        # The first stage whose end time has not yet passed is active.
        current = next(
            (s for s in self.stages if elapsed < s["duration"]),
            None,
        )
        if current is None:
            # Past the final stage: signal the runner to stop the test.
            return None
        return (current["users"], current["spawn_rate"])

Locust CLI

# Run with web UI (default port 8089)
locust -f locustfile.py --host https://api.example.com

# Headless mode
locust -f locustfile.py --host https://api.example.com \
  --headless -u 100 -r 10 --run-time 5m
# -u: total users, -r: spawn rate per second

# Distributed mode
# Master:
locust -f locustfile.py --master
# Workers (on each worker machine):
locust -f locustfile.py --worker --master-host=MASTER_IP

# CSV output
locust -f locustfile.py --headless -u 50 -r 5 --run-time 5m \
  --csv=results --csv-full-history

# HTML report
locust -f locustfile.py --headless -u 50 -r 5 --run-time 5m \
  --html=report.html

autocannon (Node.js)

CLI and Programmatic Usage

# Basic usage
autocannon -c 100 -d 30 http://localhost:3000
# -c: connections, -d: duration in seconds

# With pipelining (multiple requests per connection)
autocannon -c 100 -p 10 -d 30 http://localhost:3000

# POST with body
autocannon -c 50 -d 30 -m POST \
  -H "Content-Type=application/json" \
  -b '{"key":"value"}' \
  http://localhost:3000/api/data

# HAR file input
autocannon -c 100 -d 30 --har requests.har http://localhost:3000
// Programmatic usage
const autocannon = require('autocannon');

// `await` is not allowed at the top level of a CommonJS module, so run
// the benchmark from an async entry point.
async function main() {
  const result = await autocannon({
    url: 'http://localhost:3000',
    connections: 100,
    duration: 30,
    pipelining: 10,
    headers: {
      'Authorization': 'Bearer TOKEN',
    },
    requests: [
      { method: 'GET', path: '/api/items' },
      { method: 'POST', path: '/api/items', body: JSON.stringify({ name: 'test' }) },
    ],
  });

  console.log('Avg latency:', result.latency.average, 'ms');
  console.log('Req/sec:', result.requests.average);
  console.log('Throughput:', result.throughput.average, 'bytes/sec');
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Load Testing Methodology

Test Planning

Before running load tests:
│
├─ Define objectives
│  ├─ What SLOs must be met? (p95 < 200ms, 99.9% availability)
│  ├─ What is expected peak traffic? (from analytics/projections)
│  └─ What scenarios matter? (browse, search, checkout, API calls)
│
├─ Prepare environment
│  ├─ Use production-like infrastructure (same specs, same config)
│  ├─ Use realistic data volumes (not empty database)
│  ├─ Isolate from production traffic
│  └─ Ensure monitoring is in place (APM, metrics, logs)
│
├─ Create realistic scenarios
│  ├─ Model real user behavior (browse → search → add to cart → checkout)
│  ├─ Include think time between actions
│  ├─ Mix of read and write operations
│  ├─ Vary request payloads
│  └─ Include authentication flows
│
└─ Establish baselines
   ├─ Run smoke test first (verify test works at low load)
   ├─ Record baseline metrics at known-good load
   └─ Compare subsequent tests against baseline

Test Execution Patterns

Ramp-Up Test:
Users ▲
  100 │          ┌──────────────────┐
      │        ╱│                  │╲
   50 │      ╱  │     Sustain      │  ╲
      │    ╱    │                  │    ╲
    0 │──╱─────┼──────────────────┼─────╲──
      └────────────────────────────────────→ Time
      0    2m       5m             7m   9m

Spike Test:
Users ▲
  500 │         ╱╲
      │        ╱  ╲
  100 │───────╱    ╲───────────
      │
    0 │─────────────────────────→ Time

Soak Test:
Users ▲
  100 │  ┌──────────────────────────────┐
      │  │          4-12 hours          │
    0 │──┘                              └──
      └────────────────────────────────────→ Time

Breakpoint Test:
Users ▲
  ??? │                              ╱ ← System breaks here
      │                           ╱
      │                        ╱
      │                     ╱
      │                  ╱
    0 │───────────────╱───────────────→ Time
      Continuously increasing until failure

Results Interpretation

Key metrics to analyze:
│
├─ Latency
│  ├─ p50 (median): typical user experience
│  ├─ p95: 95% of requests complete within this time
│  ├─ p99: tail latency (1 in 100 requests)
│  ├─ p99.9: extreme tail (important at scale)
│  └─ Compare: p99/p50 ratio > 10x suggests systemic issue
│
├─ Throughput
│  ├─ Requests per second (RPS)
│  ├─ Compare achieved vs target rate
│  ├─ If achieved < target: server saturated
│  └─ Watch for throughput plateau (max capacity reached)
│
├─ Error Rate
│  ├─ HTTP 5xx errors: server failures
│  ├─ HTTP 429 errors: rate limiting
│  ├─ Timeouts: resource exhaustion
│  ├─ Connection refused: port/socket exhaustion
│  └─ Target: <0.1% under normal load
│
├─ Resource Utilization
│  ├─ CPU: >80% sustained = at capacity
│  ├─ Memory: growing = leak, high = needs more RAM
│  ├─ Disk I/O: iowait >20% = I/O bottleneck
│  ├─ Network: check bandwidth, connection count
│  └─ Connection pools: active/waiting/idle ratios
│
└─ Saturation Point
   ├─ Where latency starts increasing non-linearly
   ├─ Where error rate begins climbing
   ├─ Where throughput plateaus despite more load
   └─ This is your system's practical capacity

Common Findings and Fixes

Finding Symptom Root Cause Fix
Latency spike at load p99 jumps at N users Connection pool exhaustion Increase pool size, optimize queries
Throughput plateau RPS flat despite more VUs CPU saturation Optimize hot paths, scale horizontally
Error rate climbs gradually 5xx increases with load Memory leak under load Fix leak, increase memory, add limits
Timeout cascade Many timeouts after first No circuit breaker Add circuit breaker, retry with backoff
Uneven distribution Some pods idle, some overloaded Bad load balancing Fix health checks, use least-connections
GC pauses Periodic latency spikes Large heap, GC pressure Reduce allocations, tune GC, smaller heap
DNS resolution Intermittent slow requests DNS lookup on every request Connection pooling, DNS caching
TLS handshake overhead High latency on first request No connection reuse Keep-alive, connection pooling

CI Integration

Performance Budgets

# k6 thresholds as CI gates
// CI gate: any threshold failure makes `k6 run` exit non-zero.
export const options = {
  thresholds: {
    // abortOnFail stops the run the moment a threshold is crossed, so
    // CI fails fast instead of finishing the full test duration.
    http_req_duration: [
      { threshold: 'p(95)<500', abortOnFail: true },
      { threshold: 'p(99)<1500', abortOnFail: true },
    ],
    http_req_failed: [
      { threshold: 'rate<0.01', abortOnFail: true },   // <1% failed requests
    ],
    checks: [
      { threshold: 'rate>0.99', abortOnFail: true },   // >99% of check() calls pass
    ],
  },
};

GitHub Actions Example

# .github/workflows/load-test.yml
name: Load Test
on:
  pull_request:
    paths: ['src/**', 'package.json']

jobs:
  load-test:
    runs-on: ubuntu-latest
    services:
      app:
        image: myapp:${{ github.sha }}
        ports:
          - 8080:8080
    steps:
      - uses: actions/checkout@v4

      - name: Install k6
        run: |
          sudo gpg -k
          sudo gpg --no-default-keyring --keyring /usr/share/keyrings/k6-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C5AD17C747E3415A3642D57D77C6C491D6AC1D68
          echo "deb [signed-by=/usr/share/keyrings/k6-archive-keyring.gpg] https://dl.k6.io/deb stable main" | sudo tee /etc/apt/sources.list.d/k6.list
          sudo apt-get update
          sudo apt-get install k6

      - name: Run load test
        # --out json emits a raw NDJSON stream of metric points;
        # --summary-export writes the single end-of-test summary object
        # that the comparison step below parses with jq.
        run: k6 run --summary-export=summary.json --out json=results.json tests/load/api-test.js

      - name: Compare with baseline
        run: |
          # Extract p95 from the end-of-test summary (not results.json,
          # which is a line-delimited point stream with no aggregates)
          P95=$(jq -r '.metrics.http_req_duration["p(95)"]' summary.json)
          BASELINE=450  # ms
          if (( $(echo "$P95 > $BASELINE" | bc -l) )); then
            echo "::error::p95 latency regression: ${P95}ms > ${BASELINE}ms baseline"
            exit 1
          fi

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: load-test-results
          path: |
            results.json
            summary.json

Baseline Comparison Strategy

Performance regression detection:
│
├─ Establish baseline
│  ├─ Run load test on main branch after each merge
│  ├─ Store results in a time-series DB or artifact storage
│  └─ Track p50, p95, p99, throughput, error rate
│
├─ PR comparison
│  ├─ Run same test on PR branch
│  ├─ Compare against baseline
│  ├─ Alert if metrics degrade beyond threshold
│  └─ Common thresholds: >10% p95 increase, >5% throughput decrease
│
├─ Statistical significance
│  ├─ Run test multiple times (3-5x) to account for noise
│  ├─ Use statistical tests (t-test) to confirm regression
│  └─ Avoid false positives from system noise
│
└─ Trend tracking
   ├─ Plot metrics over time across releases
   ├─ Catch gradual degradation that per-PR tests miss
   └─ Set alerts for multi-week trends

Test Data Management

Realistic test data:
│
├─ Data volume
│  ├─ Match production data volume (or representative subset)
│  ├─ Empty DB gives misleadingly good results
│  └─ Index effectiveness depends on data distribution
│
├─ Data variety
│  ├─ Use parameterized inputs (not same request every time)
│  ├─ Vary payload sizes
│  ├─ Include edge cases (long strings, Unicode, special chars)
│  └─ Distribute IDs to avoid cache hot-spotting
│
├─ Data isolation
│  ├─ Each test run should use clean or isolated data
│  ├─ Tests that modify data should not affect next run
│  ├─ Use database transactions/rollback or test-specific namespaces
│  └─ Avoid accumulating data across test runs
│
└─ Data generation
   ├─ k6: use SharedArray for CSV/JSON data files
   ├─ Artillery: use CSV feeders, custom functions
   ├─ Locust: use Python libraries (Faker) for realistic data
   └─ General: pre-generate data, load before test