Тестирование распределённых систем
Почему тестирование распределённых систем сложно
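Распределённые системы имеют экспоненциально больше возможных состояний, чем одноузловые приложения: каждая комбинация отказов узлов, сетевых сбоев и порядка доставки сообщений даёт новое состояние. Баги проявляются только при специфических комбинациях сбоев, которые сложно воспроизвести.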
Типы проблем
| Проблема | Почему сложно обнаружить |
|---|---|
| Race condition | Проявляется только при определённом timing |
| Data loss | Возникает при partition + failover |
| Split-brain | Только при специфической топологии partition |
| Cascading failure | Нагрузка + отказ + retry storm |
| Clock skew | Проявляется при определённой разнице часов |
Факт: Netflix, Google, Amazon обнаружили критические баги в production, которые не удалось воспроизвести в тестовой среде. Это привело к развитию chaos engineering.
Chaos Engineering
Принципы
Chaos Engineering -- дисциплина экспериментов с production-системой для повышения уверенности в её устойчивости к сбоям.
Процесс:
- Определить нормальное поведение (steady state)
- Сформулировать гипотезу: "Система продолжит работать при X"
- Ввести реальный сбой (experiment)
- Сравнить поведение с ожидаемым
- Исправить обнаруженные проблемы
Типы экспериментов
| Эксперимент | Что имитируем | Инструмент |
|---|---|---|
| Kill random instance | Crash failure | Chaos Monkey (Netflix) |
| Network latency | Slow network | tc (Linux), Toxiproxy |
| Network partition | Split-brain | iptables, Blockade |
| CPU stress | Resource exhaustion | stress-ng |
| Disk full | Storage failure | dd, fallocate |
| Clock skew | Time synchronization | chrony, libfaketime |
| DNS failure | Service discovery | Blackhole DNS |
| Dependency failure | External service down | Toxiproxy, WireMock |
Netflix Simian Army
Netflix создала набор инструментов для chaos testing:
| Инструмент | Что делает |
|---|---|
| Chaos Monkey | Убивает случайные инстансы |
| Latency Monkey | Добавляет задержки |
| Chaos Kong | "Убивает" целый регион AWS |
| Conformity Monkey | Проверяет соответствие стандартам |
Fault Injection
В коде
<?php
declare(strict_types=1);
/**
 * Fault injection framework for testing distributed system resilience.
 *
 * Named fault scenarios are registered up front; instrumented code then calls
 * maybeInjectFault() (or wrapWithFault()) at the points where a failure should
 * occasionally be simulated. Injection is a no-op unless the injector was
 * created with $enabled = true and APP_ENV is not "production".
 */
final class FaultInjector
{
    /** @var array<string, array{type: string, probability: float, config: array}> */
    private array $faults = [];

    private bool $enabled;

    /**
     * @param bool $enabled Request fault injection; ignored when APP_ENV is "production".
     */
    public function __construct(bool $enabled = false)
    {
        // Hard safety switch: never inject faults in production, whatever the caller asks for.
        $this->enabled = $enabled && getenv('APP_ENV') !== 'production';
    }

    /**
     * Register a fault scenario, replacing any scenario with the same name.
     *
     * @param string $name        Identifier later passed to maybeInjectFault().
     * @param string $type        "delay", "exception" or "timeout"; any other value
     *                            (including "corrupt") performs no action in this class.
     * @param float  $probability Chance that the fault fires on each call. Values <= 0
     *                            never fire, values >= 1 always fire.
     * @param array  $config      Type-specific options: "delay_ms" (default 1000),
     *                            "message" (default "Injected fault: <name>"),
     *                            "timeout_sec" (default 30).
     */
    public function registerFault(
        string $name,
        string $type,
        float $probability = 0.1,
        array $config = [],
    ): void {
        $this->faults[$name] = [
            'type' => $type,
            'probability' => $probability,
            'config' => $config,
        ];
    }

    /**
     * Apply fault injection before an operation.
     * Call this before database queries, HTTP calls, etc.
     *
     * Does nothing when injection is disabled, when the scenario is unknown,
     * or when the random draw does not select the fault.
     *
     * @throws \RuntimeException When an "exception" fault fires.
     */
    public function maybeInjectFault(string $faultName): void
    {
        if (!$this->enabled) {
            return;
        }

        $fault = $this->faults[$faultName] ?? null;
        if ($fault === null) {
            return;
        }

        if (!$this->shouldTrigger($fault['probability'])) {
            return;
        }

        match ($fault['type']) {
            // Cast the product so fractional or float millisecond values stay valid under strict_types.
            'delay' => usleep((int) (($fault['config']['delay_ms'] ?? 1000) * 1000)),
            'exception' => throw new \RuntimeException(
                $fault['config']['message'] ?? 'Injected fault: ' . $faultName
            ),
            'timeout' => sleep((int) ($fault['config']['timeout_sec'] ?? 30)),
            // No action here: data corruption has to be applied by the calling wrapper.
            'corrupt' => null,
            default => null,
        };
    }

    /**
     * Wrap a callable with fault injection.
     *
     * The fault (if any) is applied first; the operation then runs unless an
     * "exception" fault was thrown.
     *
     * @template T
     * @param callable(): T $operation
     * @return T
     *
     * @throws \RuntimeException When an "exception" fault fires.
     */
    public function wrapWithFault(string $faultName, callable $operation): mixed
    {
        $this->maybeInjectFault($faultName);

        return $operation();
    }

    /**
     * Decide whether a fault with the given probability fires on this call.
     *
     * The draw is compared as a float, so probabilities finer than 0.001 are
     * honoured instead of being truncated to whole permille.
     */
    private function shouldTrigger(float $probability): bool
    {
        if ($probability <= 0.0) {
            return false;
        }
        if ($probability >= 1.0) {
            return true;
        }

        // mt_rand() keeps the draw reproducible via mt_srand() in tests.
        return mt_rand() / mt_getrandmax() < $probability;
    }
}
// Build an injector for a test run (it stays inactive if APP_ENV is "production").
$injector = new FaultInjector(enabled: true);

// Scenario table: name => [fault type, probability, type-specific config].
$faultScenarios = [
    'database_slow' => ['delay', 0.3, ['delay_ms' => 2000]],                           // 30% chance
    'payment_gateway_error' => ['exception', 0.1, ['message' => 'Payment gateway timeout']], // 10% chance
    'redis_timeout' => ['timeout', 0.05, ['timeout_sec' => 5]],                        // 5% chance
];

foreach ($faultScenarios as $scenarioName => [$faultType, $chance, $settings]) {
    $injector->registerFault($scenarioName, $faultType, $chance, $settings);
}
package faultinject
import (
"errors"
"math/rand/v2"
"os"
"sync"
"time"
)
// FaultConfig describes a registered fault scenario.
type FaultConfig struct {
Type string // "delay", "error", "timeout"
Probability float64 // 0.0 to 1.0
DelayMS int
TimeoutSec int
Message string
}
// FaultInjector injects failures for testing distributed system resilience.
type FaultInjector struct {
mu sync.RWMutex
enabled bool
faults map[string]FaultConfig
}
func New(enabled bool) *FaultInjector {
// Only enable in testing/staging environments
if os.Getenv("APP_ENV") == "production" {
enabled = false
}
return &FaultInjector{
enabled: enabled,
faults: make(map[string]FaultConfig),
}
}
// RegisterFault adds a fault scenario.
func (fi *FaultInjector) RegisterFault(name string, cfg FaultConfig) {
fi.mu.Lock()
fi.faults[name] = cfg
fi.mu.Unlock()
}
// MaybeInjectFault triggers a fault based on probability.
func (fi *FaultInjector) MaybeInjectFault(name string) error {
if !fi.enabled {
return nil
}
fi.mu.RLock()
fault, ok := fi.faults[name]
fi.mu.RUnlock()
if !ok || rand.Float64() > fault.Probability {
return nil
}
switch fault.Type {
case "delay":
time.Sleep(time.Duration(fault.DelayMS) * time.Millisecond)
case "error":
msg := fault.Message
if msg == "" {
msg = "injected fault: " + name
}
return errors.New(msg)
case "timeout":
time.Sleep(time.Duration(fault.TimeoutSec) * time.Second)
}
return nil
}
// WrapWithFault wraps an operation with fault injection.
func (fi *FaultInjector) WrapWithFault(name string, operation func() error) error {
if err := fi.MaybeInjectFault(name); err != nil {
return err
}
return operation()
}
<?php
declare(strict_types=1);
/**
 * Database client with fault injection support.
 * Demonstrates how to test retry and timeout behavior.
 *
 * Every attempt first passes through the "database_slow" fault injection
 * point, then runs the query as a prepared statement.
 */
final class ResilientDatabaseClient
{
    /**
     * @param \PDO          $pdo           Connection used to prepare and execute statements.
     * @param FaultInjector $faultInjector Injector consulted before every attempt.
     * @param int           $maxRetries    Maximum number of attempts; values below 1 are
     *                                     treated as a single attempt.
     * @param int           $timeoutMs     Accepted for configuration but not applied
     *                                     anywhere in this class.
     */
    public function __construct(
        private readonly \PDO $pdo,
        private readonly FaultInjector $faultInjector,
        private readonly int $maxRetries = 3,
        private readonly int $timeoutMs = 5000,
    ) {}

    /**
     * Execute query with retry logic and fault injection.
     *
     * Transient database errors are retried with exponential backoff
     * (100ms, 200ms, ... capped at 5s); injected faults are retried after a
     * fixed 100ms pause. No pause is made after the final attempt.
     *
     * @param string $sql    SQL text passed to PDO::prepare().
     * @param array  $params Bound parameters passed to PDOStatement::execute().
     *
     * @return array<array<string, mixed>>
     *
     * @throws \PDOException     On a non-transient error, or a transient one on the last attempt.
     * @throws \RuntimeException When an injected fault persists through the last attempt.
     */
    public function query(string $sql, array $params = []): array
    {
        // Always make at least one attempt so a real exception (never null) is thrown on failure.
        $maxAttempts = max(1, $this->maxRetries);

        for ($attempt = 1; ; $attempt++) {
            try {
                // Fault injection point: may throw or add delay
                $this->faultInjector->maybeInjectFault('database_slow');

                $stmt = $this->pdo->prepare($sql);
                $stmt->execute($params);

                return $stmt->fetchAll(\PDO::FETCH_ASSOC);
            } catch (\PDOException $e) {
                // Give up immediately on permanent errors or when attempts are exhausted.
                if ($attempt >= $maxAttempts || !$this->isTransientError($e)) {
                    throw $e;
                }
                // Exponential backoff
                $delayMs = min(100 * (2 ** ($attempt - 1)), 5000);
            } catch (\RuntimeException $e) {
                // Fault injection exception
                if ($attempt >= $maxAttempts) {
                    throw $e;
                }
                $delayMs = 100;
            }

            usleep($delayMs * 1000);
        }
    }

    /**
     * Tell whether the exception carries an SQLSTATE code that is worth retrying.
     */
    private function isTransientError(\PDOException $e): bool
    {
        $transientCodes = [
            '40001', // serialization failure
            '40P01', // deadlock detected
            '57P01', // admin shutdown
            '08006', // connection failure
        ];

        // getCode() may be an int; normalise it so the strict comparison can match.
        return in_array((string) $e->getCode(), $transientCodes, true);
    }
}
package faultinject
import (
"context"
"database/sql"
"fmt"
"math"
"time"
)
// ResilientDB wraps database operations with retry logic and fault injection.
type ResilientDB struct {
	// db is the connection pool that queries are sent to.
	db *sql.DB
	// injector is consulted before every query attempt.
	injector *FaultInjector
	// maxRetries bounds the number of attempts made by Query.
	maxRetries int
}

// NewResilientDB builds a ResilientDB from the given pool, fault injector and
// attempt limit. The arguments are stored as-is.
func NewResilientDB(db *sql.DB, injector *FaultInjector, maxRetries int) *ResilientDB {
	client := new(ResilientDB)
	client.db = db
	client.injector = injector
	client.maxRetries = maxRetries
	return client
}
// Query executes a query with retry logic and fault injection.
//
// Each attempt first passes the "database_slow" injection point. Injected
// errors are retried after a fixed 100ms pause; query errors that
// isTransientError accepts are retried with exponential backoff (100ms, 200ms,
// ... capped at 5s). At least one attempt is always made, no pause follows the
// final attempt, and all pauses end early when ctx is done, in which case the
// context's error is returned. A query error observed after ctx is done is
// returned immediately instead of being retried.
func (r *ResilientDB) Query(ctx context.Context, query string, args ...any) (*sql.Rows, error) {
	// Guard against a zero or negative limit so the loop body runs at least once.
	attempts := max(r.maxRetries, 1)

	var lastErr error
	for attempt := 1; attempt <= attempts; attempt++ {
		// Fault injection point
		if err := r.injector.MaybeInjectFault("database_slow"); err != nil {
			if attempt == attempts {
				return nil, fmt.Errorf("fault after %d attempts: %w", attempt, err)
			}
			if waitErr := sleepWithContext(ctx, 100*time.Millisecond); waitErr != nil {
				return nil, waitErr
			}
			continue
		}

		rows, err := r.db.QueryContext(ctx, query, args...)
		if err == nil {
			return rows, nil
		}
		lastErr = err

		// Retrying cannot succeed once the context is done; also stop on permanent errors.
		if ctx.Err() != nil || !isTransientError(err) {
			return nil, err
		}
		if attempt == attempts {
			break
		}

		// Exponential backoff
		delay := time.Duration(math.Min(
			float64(100*time.Millisecond)*math.Pow(2, float64(attempt-1)),
			float64(5*time.Second),
		))
		if waitErr := sleepWithContext(ctx, delay); waitErr != nil {
			return nil, waitErr
		}
	}
	return nil, fmt.Errorf("query failed after %d attempts: %w", attempts, lastErr)
}

// sleepWithContext pauses for d, returning early with ctx.Err() if ctx is done first.
func sleepWithContext(ctx context.Context, d time.Duration) error {
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}
// isTransientError reports whether a failed query is worth retrying.
//
// This simplified version treats every error as retryable. A production
// implementation should instead check for specific PostgreSQL error codes:
// 40001 (serialization failure), 40P01 (deadlock), 57P01 (admin shutdown),
// 08006 (connection failure).
func isTransientError(err error) (retryable bool) {
	_ = err // not inspected in the simplified version
	retryable = true
	return retryable
}
Идея
Property-based testing: вместо конкретных тест-кейсов определяем свойства (properties), которые должны выполняться для любых входных данных. Фреймворк генерирует случайные данные и проверяет, что свойства выполняются.
Свойства для распределённых систем
| Свойство | Описание | Проверка |
|---|---|---|
| Idempotency | Повторный запрос даёт тот же результат | f(x) == f(f(x)) |
| Commutativity | Порядок не важен | f(a,b) == f(b,a) |
| Convergence | Реплики сходятся | После синхронизации все одинаковы |
| Monotonicity | Значения не уменьшаются | counter(t+1) >= counter(t) |
| Linearizability | История допустима | Knossos/Elle checker |
<?php
declare(strict_types=1);
/**
 * Property-based checks for distributed system components.
 *
 * Each check feeds randomly generated inputs to caller-supplied callables and
 * reports whether the property held for all of them.
 */
final class PropertyTests
{
    /**
     * Check that applying an operation twice leaves the same state as applying it once.
     * Critical property for distributed systems with retry.
     *
     * @param callable $operation   Operation under test; receives one generated input array.
     * @param callable $stateReader Returns the current state for comparison.
     * @param int      $iterations  Number of random inputs to try.
     *
     * @return array{passed: bool, iterations: int, failures: array}
     */
    public static function testIdempotency(
        callable $operation,
        callable $stateReader,
        int $iterations = 100,
    ): array {
        $violations = [];
        $round = 0;

        while ($round < $iterations) {
            $sample = self::generateRandomInput();

            // First application.
            $operation($sample);
            $snapshotOnce = $stateReader();

            // Second application with the very same input simulates a retry.
            $operation($sample);
            $snapshotTwice = $stateReader();

            if ($snapshotOnce !== $snapshotTwice) {
                $violations[] = [
                    'input' => $sample,
                    'after_first' => $snapshotOnce,
                    'after_second' => $snapshotTwice,
                ];
            }

            $round++;
        }

        return self::summarize($iterations, $violations);
    }

    /**
     * Check that successive generated values are strictly increasing.
     * Important property for sequence generators, version counters.
     *
     * @param callable $generateNext Produces the next value; called $iterations times
     *                               (and once even when $iterations is below 1).
     * @param int      $iterations   Number of values to draw.
     *
     * @return array{passed: bool, iterations: int, failures: array}
     */
    public static function testMonotonicity(
        callable $generateNext,
        int $iterations = 1000,
    ): array {
        $violations = [];
        $last = $generateNext();
        $step = 1;

        while ($step < $iterations) {
            $next = $generateNext();

            // Equal values count as a violation too.
            if ($next <= $last) {
                $violations[] = [
                    'iteration' => $step,
                    'previous' => $last,
                    'current' => $next,
                ];
            }

            $last = $next;
            $step++;
        }

        return self::summarize($iterations, $violations);
    }

    /**
     * Check convergence of CRDT-like data structures.
     *
     * Random operations are applied to random replicas; the replicas are then
     * merged starting from each replica in turn, and every merge order must
     * yield the same state as the merge starting from replica 0.
     *
     * @param callable $createReplica  Returns a fresh replica.
     * @param callable $applyOperation Receives (replica, input array) and returns the updated replica.
     * @param callable $merge          Receives two replicas and returns the merged replica.
     * @param callable $getState       Extracts a comparable state from a replica.
     * @param int      $replicas       Number of replicas to create.
     * @param int      $operations     Number of random operations to apply.
     *
     * @return array{converged: bool, replicas: int, operations: int}
     */
    public static function testConvergence(
        callable $createReplica,
        callable $applyOperation,
        callable $merge,
        callable $getState,
        int $replicas = 3,
        int $operations = 50,
    ): array {
        // Fresh replicas, indexed 0..$replicas-1.
        $nodes = [];
        while (count($nodes) < $replicas) {
            $nodes[] = $createReplica();
        }

        // Each operation lands on one randomly chosen replica.
        for ($remaining = $operations; $remaining > 0; $remaining--) {
            $targetReplica = mt_rand(0, $replicas - 1);
            $nodes[$targetReplica] = $applyOperation(
                $nodes[$targetReplica],
                self::generateRandomInput(),
            );
        }

        // Reference result: fold every other replica into replica 0, in index order.
        $reference = $nodes[0];
        foreach (array_slice($nodes, 1) as $node) {
            $reference = $merge($reference, $node);
        }
        $expectedState = $getState($reference);

        // Starting from each replica, merging in all the others must reach the same state.
        $converged = true;
        foreach ($nodes as $index => $start) {
            $combined = $start;
            foreach ($nodes as $peerIndex => $peer) {
                if ($peerIndex === $index) {
                    continue;
                }
                $combined = $merge($combined, $peer);
            }

            if ($getState($combined) !== $expectedState) {
                $converged = false;
            }
        }

        return [
            'converged' => $converged,
            'replicas' => $replicas,
            'operations' => $operations,
        ];
    }

    /**
     * Build the result array shared by the iteration-based checks.
     */
    private static function summarize(int $iterations, array $violations): array
    {
        return [
            'passed' => $violations === [],
            'iterations' => $iterations,
            'failures' => $violations,
        ];
    }

    /**
     * Produce one random input: a key from key_1..key_100, a value from
     * 1..10000 and the current time as a float timestamp.
     */
    private static function generateRandomInput(): array
    {
        $key = 'key_' . mt_rand(1, 100);
        $value = mt_rand(1, 10000);

        return [
            'key' => $key,
            'value' => $value,
            'timestamp' => microtime(true),
        ];
    }
}
package proptest
import (
"fmt"
"math/rand/v2"
"reflect"
"time"
)
// TestResult holds the result of a property-based test.
type TestResult struct {
Passed bool
Iterations int
Failures []string
}
// TestIdempotency verifies that an operation produces the same state when applied twice.
func TestIdempotency(
operation func(input map[string]any),
stateReader func() any,
iterations int,
) TestResult {
var failures []string
for i := range iterations {
input := generateRandomInput()
operation(input)
stateAfterFirst := stateReader()
operation(input)
stateAfterSecond := stateReader()
if !reflect.DeepEqual(stateAfterFirst, stateAfterSecond) {
failures = append(failures, fmt.Sprintf(
"iteration %d: state changed on retry", i,
))
}
}
return TestResult{Passed: len(failures) == 0, Iterations: iterations, Failures: failures}
}
// TestMonotonicity verifies that generated values are strictly increasing.
func TestMonotonicity(generateNext func() int64, iterations int) TestResult {
var failures []string
previous := generateNext()
for i := 1; i < iterations; i++ {
current := generateNext()
if current <= previous {
failures = append(failures, fmt.Sprintf(
"iteration %d: %d <= %d", i, current, previous,
))
}
previous = current
}
return TestResult{Passed: len(failures) == 0, Iterations: iterations, Failures: failures}
}
// ConvergenceResult holds the CRDT convergence test result.
type ConvergenceResult struct {
Converged bool
Replicas int
Operations int
}
// TestConvergence verifies that CRDT-like structures converge after merge.
func TestConvergence[T any](
createReplica func() T,
applyOp func(T, map[string]any) T,
merge func(T, T) T,
getState func(T) any,
replicas, operations int,
) ConvergenceResult {
states := make([]T, replicas)
for r := range replicas {
states[r] = createReplica()
}
for range operations {
target := rand.IntN(replicas)
states[target] = applyOp(states[target], generateRandomInput())
}
// Merge all and verify convergence
merged := states[0]
for r := 1; r < replicas; r++ {
merged = merge(merged, states[r])
}
finalState := getState(merged)
allSame := true
for r := range replicas {
m := states[r]
for other := range replicas {
if other != r {
m = merge(m, states[other])
}
}
if !reflect.DeepEqual(getState(m), finalState) {
allSame = false
break
}
}
return ConvergenceResult{Converged: allSame, Replicas: replicas, Operations: operations}
}
func generateRandomInput() map[string]any {
return map[string]any{
"key": fmt.Sprintf("key_%d", rand.IntN(100)+1),
"value": rand.IntN(10000) + 1,
"timestamp": time.Now().UnixMicro(),
}
}
<?php
declare(strict_types=1);
/**
 * Controlled-failure helper for exercising retry logic.
 *
 * Wraps an operation that throws on its first calls and succeeds once the
 * call counter reaches a configured attempt number.
 */
final class RetryTestHelper
{
    private int $callCount = 0;

    /**
     * @param int    $failUntilAttempt First call number that succeeds; all earlier calls throw.
     * @param string $failureType      Selects the exception message: "exception", "timeout"
     *                                 or "http_503"; anything else yields "Unknown failure".
     */
    public function __construct(
        private int $failUntilAttempt = 3,
        private string $failureType = 'exception',
    ) {}

    /**
     * Operation that fails until the configured attempt, then succeeds.
     * Use this to test retry logic.
     *
     * @throws \RuntimeException On every call made before attempt number $failUntilAttempt.
     */
    public function unreliableOperation(): string
    {
        $attempt = ++$this->callCount;

        if ($attempt >= $this->failUntilAttempt) {
            return 'success';
        }

        $reason = match ($this->failureType) {
            'exception' => "Transient failure (attempt {$attempt})",
            'timeout' => 'Connection timed out',
            'http_503' => '503 Service Unavailable',
            default => 'Unknown failure',
        };

        throw new \RuntimeException($reason);
    }

    /**
     * Number of times unreliableOperation() has been called since construction or the last reset().
     */
    public function getCallCount(): int
    {
        return $this->callCount;
    }

    /**
     * Reset the call counter to zero so the failure sequence starts over.
     */
    public function reset(): void
    {
        $this->callCount = 0;
    }
}
/**
 * Example checks for retry logic, written as static methods that use plain
 * assert() and drive RetryStrategy::withExponentialBackoff() with a
 * RetryTestHelper.
 */
final class RetryStrategyTest
{
    /**
     * A helper that succeeds on its 3rd call must yield "success" after exactly 3 calls.
     */
    public static function testRetrySucceedsAfterTransientFailures(): void
    {
        $flaky = new RetryTestHelper(failUntilAttempt: 3);

        $outcome = RetryStrategy::withExponentialBackoff(
            operation: $flaky->unreliableOperation(...),
            maxRetries: 5,
            baseDelayMs: 10,
        );

        assert($outcome === 'success', 'Should succeed after retries');
        assert($flaky->getCallCount() === 3, 'Should have called 3 times');
    }

    /**
     * When the helper keeps failing beyond the retry budget, the strategy must
     * rethrow after the initial call plus 3 retries.
     */
    public static function testRetryGivesUpAfterMaxRetries(): void
    {
        // Needs 10 attempts to succeed: more failures than the 3 retries allowed below.
        $flaky = new RetryTestHelper(failUntilAttempt: 10);

        $gaveUp = false;
        try {
            RetryStrategy::withExponentialBackoff(
                operation: $flaky->unreliableOperation(...),
                maxRetries: 3,
                baseDelayMs: 10,
            );
        } catch (\RuntimeException) {
            $gaveUp = true;
        }

        assert($gaveUp, 'Should throw after max retries');
        assert($flaky->getCallCount() === 4, 'Should have called 4 times (initial + 3 retries)');
    }

    /**
     * Three failures with a 100ms base delay should take roughly
     * 100 + 200 + 400 = 700ms; the bounds are loose to tolerate jitter.
     */
    public static function testExponentialBackoffTiming(): void
    {
        $flaky = new RetryTestHelper(failUntilAttempt: 4);

        $startedAt = microtime(true);
        RetryStrategy::withExponentialBackoff(
            operation: $flaky->unreliableOperation(...),
            maxRetries: 5,
            baseDelayMs: 100,
        );
        $elapsedMs = (microtime(true) - $startedAt) * 1000;

        assert($elapsedMs > 500, 'Total delay should be at least 500ms');
        assert($elapsedMs < 2000, 'Total delay should not exceed 2s (with jitter)');
    }
}
// Run the retry checks in declaration order.
$retryChecks = [
    RetryStrategyTest::testRetrySucceedsAfterTransientFailures(...),
    RetryStrategyTest::testRetryGivesUpAfterMaxRetries(...),
    RetryStrategyTest::testExponentialBackoffTiming(...),
];

foreach ($retryChecks as $runCheck) {
    $runCheck();
}
Инструменты тестирования
| Инструмент | Тип | Применение |
|---|---|---|
| Jepsen | Consistency testing | Проверка гарантий консистентности |
| Chaos Monkey | Chaos engineering | Убийство случайных инстансов |
| Toxiproxy | Network simulation | Задержки, разрывы, ограничения |
| Litmus | Chaos engineering (Kubernetes) | Chaos-эксперименты в Kubernetes-кластерах |
| Blockade | Docker chaos | Partition между контейнерами |
| tc (Linux) | Network shaping | Задержки, потеря пакетов |
Выводы
- Распределённые системы нужно тестировать под нагрузкой со сбоями
- Chaos engineering -- дисциплина экспериментов с production для повышения устойчивости
- Fault injection позволяет имитировать сбои в коде (задержки, исключения, таймауты)
- Property-based testing проверяет инварианты (idempotency, monotonicity, convergence)
- Retry logic нужно тестировать с контролируемыми сбоями
- Jepsen -- золотой стандарт для тестирования гарантий консистентности
- Тестируйте recovery, а не только happy path