Comprehensive guide to load testing tools, methodology, and CI integration.
// k6 script: load-test.js
import http from 'k6/http';
import { check, sleep, group } from 'k6';
import { Rate, Trend, Counter } from 'k6/metrics';
// Custom metrics
const errorRate = new Rate('errors');
const responseTime = new Trend('response_time');
const requestCount = new Counter('total_requests');
// Test configuration
export const options = {
// Scenario-based configuration
scenarios: {
// Ramp up and sustain load
load_test: {
executor: 'ramping-vus',
startVUs: 0,
stages: [
{ duration: '2m', target: 50 }, // Ramp up
{ duration: '5m', target: 50 }, // Sustain
{ duration: '2m', target: 0 }, // Ramp down
],
gracefulRampDown: '30s',
},
},
// Thresholds (pass/fail criteria)
thresholds: {
http_req_duration: ['p(95)<500', 'p(99)<1000'], // ms
http_req_failed: ['rate<0.01'], // <1% error rate
errors: ['rate<0.05'], // Custom metric
},
};
// Setup: runs once before test
export function setup() {
const loginRes = http.post('https://api.example.com/login', {
username: 'testuser',
password: 'testpass',
});
return { token: loginRes.json('token') };
}
// Default function: runs for each VU iteration
export default function (data) {
group('API endpoints', function () {
// GET request
const listRes = http.get('https://api.example.com/items', {
headers: { Authorization: `Bearer ${data.token}` },
});
check(listRes, {
'status is 200': (r) => r.status === 200,
'response time < 500ms': (r) => r.timings.duration < 500,
'has items': (r) => r.json('items').length > 0,
});
errorRate.add(listRes.status !== 200);
responseTime.add(listRes.timings.duration);
requestCount.add(1);
// POST request
const createRes = http.post(
'https://api.example.com/items',
JSON.stringify({ name: 'test item', value: Math.random() }),
{
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${data.token}`,
},
}
);
check(createRes, {
'created successfully': (r) => r.status === 201,
});
sleep(1); // Think time between requests
});
}
// Teardown: runs once after test
export function teardown(data) {
http.post('https://api.example.com/cleanup', null, {
headers: { Authorization: `Bearer ${data.token}` },
});
}
Executor selection:
│
├─ shared-iterations
│ └─ Fixed total iterations split across VUs
│ Use: "Run exactly N requests total"
│
├─ per-vu-iterations
│ └─ Each VU runs exactly N iterations
│ Use: "Each user does N actions"
│
├─ constant-vus
│ └─ Fixed number of VUs for a duration
│ Use: "Sustain N concurrent users"
│
├─ ramping-vus
│ └─ VUs ramp up/down in stages
│ Use: Standard load test pattern
│
├─ constant-arrival-rate
│ └─ Fixed request rate regardless of response time
│ Use: "Maintain exactly N RPS" (most realistic)
│
├─ ramping-arrival-rate
│ └─ Request rate ramps up/down
│ Use: "Find breaking point at increasing RPS"
│
└─ externally-controlled
└─ VUs controlled via k6 REST API
Use: Dynamic load adjustment during test
# Run a test
k6 run script.js
# Run with overrides
k6 run --vus 50 --duration 30s script.js
k6 run --env BASE_URL=https://staging.example.com script.js
# Output to various formats
k6 run --out json=results.json script.js
k6 run --out csv=results.csv script.js
k6 run --out influxdb=http://localhost:8086/k6 script.js
# Cloud execution (requires k6 cloud account)
k6 cloud script.js
# Convert HAR to k6 script
k6 convert recording.har -O generated-script.js
# Inspect script options without running
k6 inspect script.js
import { browser } from 'k6/browser';
export const options = {
scenarios: {
browser: {
executor: 'constant-vus',
vus: 1,
duration: '30s',
options: {
browser: {
type: 'chromium',
},
},
},
},
};
export default async function () {
const page = await browser.newPage();
try {
await page.goto('https://example.com');
await page.locator('input[name="username"]').fill('testuser');
await page.locator('input[name="password"]').fill('testpass');
await page.locator('button[type="submit"]').click();
await page.waitForNavigation();
// Measure Web Vitals
const lcp = await page.evaluate(() => {
return new Promise((resolve) => {
new PerformanceObserver((list) => {
const entries = list.getEntries();
resolve(entries[entries.length - 1].startTime);
}).observe({ type: 'largest-contentful-paint', buffered: true });
});
});
console.log(`LCP: ${lcp}ms`);
} finally {
await page.close();
}
}
# artillery-config.yml
config:
target: "https://api.example.com"
phases:
- duration: 120 # 2 minutes
arrivalRate: 10 # 10 new users per second
name: "Warm-up"
- duration: 300 # 5 minutes
arrivalRate: 50 # 50 new users per second
name: "Sustained load"
- duration: 60
arrivalRate: 100
name: "Spike"
# Plugins
plugins:
expect: {} # Response validation
metrics-by-endpoint: {} # Per-endpoint metrics
# Default headers
defaults:
headers:
Content-Type: "application/json"
# Variables
variables:
baseUrl: "https://api.example.com"
# Connection settings
http:
timeout: 10 # seconds
pool: 100 # connection pool size
scenarios:
- name: "Browse and purchase"
weight: 70 # 70% of traffic
flow:
- get:
url: "/products"
expect:
- statusCode: 200
- hasProperty: "items"
capture:
- json: "$.items[0].id"
as: "productId"
- think: 3 # 3 second pause
- get:
url: "/products/{{ productId }}"
expect:
- statusCode: 200
- post:
url: "/cart"
json:
productId: "{{ productId }}"
quantity: 1
expect:
- statusCode: 201
- name: "Search"
weight: 30 # 30% of traffic
flow:
- get:
url: "/search?q={{ $randomString() }}"
expect:
- statusCode: 200
# Run load test
artillery run artillery-config.yml
# Quick test (no config needed)
artillery quick --count 100 --num 10 https://api.example.com
# Generate HTML report
artillery run --output report.json artillery-config.yml
artillery report report.json
# Run with environment-specific config
artillery run -e staging artillery-config.yml
# Run with Playwright (browser scenarios)
artillery run --platform playwright artillery-browser.yml
# Basic attack
echo "GET http://localhost:8080/" | vegeta attack -duration=30s -rate=50/s | vegeta report
# Multiple endpoints from file
# targets.txt:
# GET http://localhost:8080/api/users
# GET http://localhost:8080/api/products
# POST http://localhost:8080/api/orders
# Content-Type: application/json
# @body.json
vegeta attack -targets=targets.txt -duration=60s -rate=100/s | vegeta report
# Custom headers
echo "GET http://localhost:8080/api/data" | \
vegeta attack -header "Authorization: Bearer TOKEN" -duration=30s | \
vegeta report
# Output formats
echo "GET http://localhost:8080/" | vegeta attack -duration=30s | vegeta report -type=text
echo "GET http://localhost:8080/" | vegeta attack -duration=30s | vegeta report -type=json
echo "GET http://localhost:8080/" | vegeta attack -duration=30s | vegeta report -type=hist[0,50ms,100ms,200ms,500ms,1s]
# Generate latency plot (HDR histogram)
echo "GET http://localhost:8080/" | vegeta attack -duration=60s | vegeta plot > plot.html
# Encode results for later analysis
echo "GET http://localhost:8080/" | vegeta attack -duration=60s | vegeta encode > results.json
# Constant rate vs max rate
echo "GET http://localhost:8080/" | vegeta attack -rate=0 -max-workers=100 -duration=30s | vegeta report
# -rate=0 means "as fast as possible" with max-workers limit
Requests [total, rate, throughput] 3000, 100.03, 99.87
Duration [total, attack, wait] 30.04s, 29.99s, 49.54ms
Latencies [min, mean, 50, 90, 95, 99, max] 12.5ms, 48.2ms, 42.1ms, 85.3ms, 120.5ms, 250.1ms, 1.2s
Bytes In [total, mean] 1500000, 500.00
Bytes Out [total, mean] 0, 0.00
Success [ratio] 99.5%
Status Codes [code:count] 200:2985 500:15
Key metrics:
- p50 (median): typical user experience
- p95: 95% of users experience this or better
- p99: tail latency (worst 1%)
- Success ratio: anything below 99% needs investigation
- Throughput vs rate: throughput < rate means server can't keep up
# Basic usage
wrk -t4 -c100 -d30s http://localhost:8080/
# -t4: 4 threads
# -c100: 100 connections
# -d30s: 30 second duration
# With Lua script
wrk -t4 -c100 -d30s -s script.lua http://localhost:8080/
# wrk2 (constant throughput mode)
wrk2 -t4 -c100 -d30s -R2000 http://localhost:8080/
# -R2000: target 2000 requests/second
-- post-request.lua: POST with JSON body
wrk.method = "POST"
wrk.body = '{"username":"test","password":"test"}'
wrk.headers["Content-Type"] = "application/json"
-- dynamic-request.lua: different paths per request
counter = 0
request = function()
counter = counter + 1
local path = "/api/items/" .. (counter % 1000)
return wrk.format("GET", path)
end
-- response.lua: validate responses
response = function(status, headers, body)
if status ~= 200 then
wrk.thread:stop()
end
end
-- report.lua: custom reporting
done = function(summary, latency, requests)
io.write("Latency distribution:\n")
for _, p in pairs({ 50, 90, 95, 99, 99.9 }) do
n = latency:percentile(p)
io.write(string.format("%g%%\t%d ms\n", p, n / 1000))
end
end
# locustfile.py
from locust import HttpUser, task, between, events
from locust import LoadTestShape
import json
class WebsiteUser(HttpUser):
# Wait between requests (simulates think time)
wait_time = between(1, 5)
# Run once per user on start
def on_start(self):
response = self.client.post("/login", json={
"username": "testuser",
"password": "testpass"
})
self.token = response.json()["token"]
self.client.headers.update({
"Authorization": f"Bearer {self.token}"
})
@task(3) # Weight: 3x more likely than weight-1 tasks
def browse_items(self):
with self.client.get("/api/items", catch_response=True) as response:
if response.status_code == 200:
items = response.json()["items"]
if len(items) == 0:
response.failure("No items returned")
else:
response.failure(f"Status {response.status_code}")
@task(1)
def create_item(self):
self.client.post("/api/items", json={
"name": f"item-{self.environment.runner.user_count}",
"value": 42
})
@task(2)
def search(self):
self.client.get("/api/search?q=test")
def on_stop(self):
self.client.post("/logout")
class AdminUser(HttpUser):
"""Separate user class with different behavior"""
wait_time = between(5, 15)
weight = 1 # 1 admin for every 10 regular users (if WebsiteUser weight=10)
@task
def check_dashboard(self):
self.client.get("/admin/dashboard")
# Custom load shape
class StagesShape(LoadTestShape):
"""Ramp up, sustain, spike, recover"""
stages = [
{"duration": 60, "users": 10, "spawn_rate": 2},
{"duration": 300, "users": 50, "spawn_rate": 5},
{"duration": 360, "users": 200, "spawn_rate": 50}, # Spike
{"duration": 420, "users": 50, "spawn_rate": 10}, # Recover
{"duration": 480, "users": 0, "spawn_rate": 10}, # Ramp down
]
def tick(self):
run_time = self.get_run_time()
for stage in self.stages:
if run_time < stage["duration"]:
return (stage["users"], stage["spawn_rate"])
return None
# Run with web UI (default port 8089)
locust -f locustfile.py --host https://api.example.com
# Headless mode
locust -f locustfile.py --host https://api.example.com \
--headless -u 100 -r 10 --run-time 5m
# -u: total users, -r: spawn rate per second
# Distributed mode
# Master:
locust -f locustfile.py --master
# Workers (on each worker machine):
locust -f locustfile.py --worker --master-host=MASTER_IP
# CSV output
locust -f locustfile.py --headless -u 50 -r 5 --run-time 5m \
--csv=results --csv-full-history
# HTML report
locust -f locustfile.py --headless -u 50 -r 5 --run-time 5m \
--html=report.html
# Basic usage
autocannon -c 100 -d 30 http://localhost:3000
# -c: connections, -d: duration in seconds
# With pipelining (multiple requests per connection)
autocannon -c 100 -p 10 -d 30 http://localhost:3000
# POST with body
autocannon -c 50 -d 30 -m POST \
-H "Content-Type=application/json" \
-b '{"key":"value"}' \
http://localhost:3000/api/data
# HAR file input
autocannon -c 100 -d 30 --har requests.har http://localhost:3000
// Programmatic usage
const autocannon = require('autocannon');
const result = await autocannon({
url: 'http://localhost:3000',
connections: 100,
duration: 30,
pipelining: 10,
headers: {
'Authorization': 'Bearer TOKEN',
},
requests: [
{ method: 'GET', path: '/api/items' },
{ method: 'POST', path: '/api/items', body: JSON.stringify({ name: 'test' }) },
],
});
console.log('Avg latency:', result.latency.average, 'ms');
console.log('Req/sec:', result.requests.average);
console.log('Throughput:', result.throughput.average, 'bytes/sec');
Before running load tests:
│
├─ Define objectives
│ ├─ What SLOs must be met? (p95 < 200ms, 99.9% availability)
│ ├─ What is expected peak traffic? (from analytics/projections)
│ └─ What scenarios matter? (browse, search, checkout, API calls)
│
├─ Prepare environment
│ ├─ Use production-like infrastructure (same specs, same config)
│ ├─ Use realistic data volumes (not empty database)
│ ├─ Isolate from production traffic
│ └─ Ensure monitoring is in place (APM, metrics, logs)
│
├─ Create realistic scenarios
│ ├─ Model real user behavior (browse → search → add to cart → checkout)
│ ├─ Include think time between actions
│ ├─ Mix of read and write operations
│ ├─ Vary request payloads
│ └─ Include authentication flows
│
└─ Establish baselines
├─ Run smoke test first (verify test works at low load)
├─ Record baseline metrics at known-good load
└─ Compare subsequent tests against baseline
Ramp-Up Test:
Users ▲
100 │ ┌──────────────────┐
│ ╱│ │╲
50 │ ╱ │ Sustain │ ╲
│ ╱ │ │ ╲
0 │──╱─────┼──────────────────┼─────╲──
└────────────────────────────────────→ Time
0 2m 5m 7m 9m
Spike Test:
Users ▲
500 │ ╱╲
│ ╱ ╲
100 │───────╱ ╲───────────
│
0 │─────────────────────────→ Time
Soak Test:
Users ▲
100 │ ┌──────────────────────────────┐
│ │ 4-12 hours │
0 │──┘ └──
└────────────────────────────────────→ Time
Breakpoint Test:
Users ▲
??? │ ╱ ← System breaks here
│ ╱
│ ╱
│ ╱
│ ╱
0 │───────────────╱───────────────→ Time
Continuously increasing until failure
Key metrics to analyze:
│
├─ Latency
│ ├─ p50 (median): typical user experience
│ ├─ p95: most users' worst experience
│ ├─ p99: tail latency (1 in 100 requests)
│ ├─ p99.9: extreme tail (important at scale)
│ └─ Compare: p99/p50 ratio > 10x suggests systemic issue
│
├─ Throughput
│ ├─ Requests per second (RPS)
│ ├─ Compare achieved vs target rate
│ ├─ If achieved < target: server saturated
│ └─ Watch for throughput plateau (max capacity reached)
│
├─ Error Rate
│ ├─ HTTP 5xx errors: server failures
│ ├─ HTTP 429 errors: rate limiting
│ ├─ Timeouts: resource exhaustion
│ ├─ Connection refused: port/socket exhaustion
│ └─ Target: <0.1% under normal load
│
├─ Resource Utilization
│ ├─ CPU: >80% sustained = at capacity
│ ├─ Memory: growing = leak, high = needs more RAM
│ ├─ Disk I/O: iowait >20% = I/O bottleneck
│ ├─ Network: check bandwidth, connection count
│ └─ Connection pools: active/waiting/idle ratios
│
└─ Saturation Point
├─ Where latency starts increasing non-linearly
├─ Where error rate begins climbing
├─ Where throughput plateaus despite more load
└─ This is your system's practical capacity
| Finding | Symptom | Root Cause | Fix |
|---|---|---|---|
| Latency spike at load | p99 jumps at N users | Connection pool exhaustion | Increase pool size, optimize queries |
| Throughput plateau | RPS flat despite more VUs | CPU saturation | Optimize hot paths, scale horizontally |
| Error rate climbs gradually | 5xx increases with load | Memory leak under load | Fix leak, increase memory, add limits |
| Timeout cascade | Many timeouts after first | No circuit breaker | Add circuit breaker, retry with backoff |
| Uneven distribution | Some pods idle, some overloaded | Bad load balancing | Fix health checks, use least-connections |
| GC pauses | Periodic latency spikes | Large heap, GC pressure | Reduce allocations, tune GC, smaller heap |
| DNS resolution | Intermittent slow requests | DNS lookup on every request | Connection pooling, DNS caching |
| TLS handshake overhead | High latency on first request | No connection reuse | Keep-alive, connection pooling |
# k6 thresholds as CI gates
export const options = {
thresholds: {
http_req_duration: [
{ threshold: 'p(95)<500', abortOnFail: true },
{ threshold: 'p(99)<1500', abortOnFail: true },
],
http_req_failed: [
{ threshold: 'rate<0.01', abortOnFail: true },
],
checks: [
{ threshold: 'rate>0.99', abortOnFail: true },
],
},
};
# .github/workflows/load-test.yml
name: Load Test
on:
pull_request:
paths: ['src/**', 'package.json']
jobs:
load-test:
runs-on: ubuntu-latest
services:
app:
image: myapp:${{ github.sha }}
ports:
- 8080:8080
steps:
- uses: actions/checkout@v4
- name: Install k6
run: |
sudo gpg -k
sudo gpg --no-default-keyring --keyring /usr/share/keyrings/k6-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C5AD17C747E3415A3642D57D77C6C491D6AC1D68
echo "deb [signed-by=/usr/share/keyrings/k6-archive-keyring.gpg] https://dl.k6.io/deb stable main" | sudo tee /etc/apt/sources.list.d/k6.list
sudo apt-get update
sudo apt-get install k6
- name: Run load test
run: k6 run --out json=results.json tests/load/api-test.js
- name: Compare with baseline
run: |
# Extract p95 from results
P95=$(jq -r '.data.metrics.http_req_duration.values["p(95)"]' results.json)
BASELINE=450 # ms
if (( $(echo "$P95 > $BASELINE" | bc -l) )); then
echo "::error::p95 latency regression: ${P95}ms > ${BASELINE}ms baseline"
exit 1
fi
- name: Upload results
if: always()
uses: actions/upload-artifact@v4
with:
name: load-test-results
path: results.json
Performance regression detection:
│
├─ Establish baseline
│ ├─ Run load test on main branch after each merge
│ ├─ Store results in a time-series DB or artifact storage
│ └─ Track p50, p95, p99, throughput, error rate
│
├─ PR comparison
│ ├─ Run same test on PR branch
│ ├─ Compare against baseline
│ ├─ Alert if metrics degrade beyond threshold
│ └─ Common thresholds: >10% p95 increase, >5% throughput decrease
│
├─ Statistical significance
│ ├─ Run test multiple times (3-5x) to account for noise
│ ├─ Use statistical tests (t-test) to confirm regression
│ └─ Avoid false positives from system noise
│
└─ Trend tracking
├─ Plot metrics over time across releases
├─ Catch gradual degradation that per-PR tests miss
└─ Set alerts for multi-week trends
Realistic test data:
│
├─ Data volume
│ ├─ Match production data volume (or representative subset)
│ ├─ Empty DB gives misleadingly good results
│ └─ Index effectiveness depends on data distribution
│
├─ Data variety
│ ├─ Use parameterized inputs (not same request every time)
│ ├─ Vary payload sizes
│ ├─ Include edge cases (long strings, Unicode, special chars)
│ └─ Distribute IDs to avoid cache hot-spotting
│
├─ Data isolation
│ ├─ Each test run should use clean or isolated data
│ ├─ Tests that modify data should not affect next run
│ ├─ Use database transactions/rollback or test-specific namespaces
│ └─ Avoid accumulating data across test runs
│
└─ Data generation
├─ k6: use SharedArray for CSV/JSON data files
├─ Artillery: use CSV feeders, custom functions
├─ Locust: use Python libraries (Faker) for realistic data
└─ General: pre-generate data, load before test