Flaky Async Tests (Death by `sleep`)

If your async tests rely on fixed delays, you're gambling. Sometimes CI is slow, sometimes your laptop is fast — either way, `sleep(2000)` doesn't prove correctness, it hides races.

TL;DR

  • Don't wait for time; wait for signals.
  • Use synchronization (channels, events, latches) or poll-with-timeout patterns.
  • Prefer fake clocks, which let tests advance time instantly.

Why `sleep` is a smell

  1. Non-deterministic: timing varies across machines/CI.
  2. Slow: padding tests with seconds kills your suite speed.
  3. Racy: might pass locally, fail in CI — or vice versa.

Better Patterns

1) Signal Completion Explicitly

Have the worker signal when it's done with an operation.

Go

// Process doubles every value received from in and sends the result to out.
//
// It returns when in is closed or ctx is cancelled, whichever happens first,
// and closes done on every exit path so callers can wait on a deterministic
// completion signal instead of sleeping.
func Process(ctx context.Context, in <-chan int, out chan<- int, done chan<- struct{}) {
    // Deferred so done is closed no matter how the function exits.
    defer close(done)

    for {
        select {
        case <-ctx.Done():
            return
        case v, ok := <-in:
            if !ok {
                return // in was closed: no more work
            }
            // Guard the send as well: without a reader on out, an unguarded
            // send would block forever and done would never be closed.
            select {
            case out <- v * 2:
            case <-ctx.Done():
                return
            }
        }
    }
}

// TestProcess_SignalsDone pushes one value through Process and then waits on
// channels, never on a fixed sleep, for both the result and the completion
// signal. The one-second timers only bound how long a failure takes to surface.
func TestProcess_SignalsDone(t *testing.T) {
    input := make(chan int, 1)
    output := make(chan int, 1)
    finished := make(chan struct{})

    go Process(context.Background(), input, output, finished)
    input <- 21
    close(input)

    // First wait: the doubled value must arrive on the output channel.
    select {
    case got := <-output:
        if got != 42 {
            t.Fatalf("got %d", got)
        }
    case <-time.After(time.Second):
        t.Fatal("timeout waiting for output")
    }

    // Second wait: the worker must announce that it has finished.
    select {
    case <-finished: // ✅ deterministic completion signal
    case <-time.After(time.Second):
        t.Fatal("timeout waiting for done")
    }
}

Node/JS (events or promises)

import { EventEmitter } from 'node:events';
import { strict as assert } from 'node:assert';

/**
 * Creates an emitter that, in a queued microtask, emits 'data' with 42 and
 * then 'done'. Because the emits are deferred, listeners attached
 * synchronously after the call returns still observe both events.
 */
function processEE() {
  const emitter = new EventEmitter();
  const emitAll = () => {
    emitter.emit('data', 42);
    emitter.emit('done');
  };
  queueMicrotask(emitAll);
  return emitter;
}

// Verifies both halves of the contract: the payload arrives on 'data' AND the
// emitter announces completion with 'done'. Resolving on 'done' (rather than
// on 'data') is what makes the test live up to its name.
test('signals done', async () => {
  const ee = processEE();
  const got = await new Promise((res, rej) => {
    // The timer only bounds how long a failure takes; success never waits on it.
    const timer = setTimeout(() => rej(new Error('timeout')), 1000);
    let payload;
    // Both listeners are attached synchronously, before the emitter's
    // queued microtask fires, so neither event can be missed.
    ee.once('data', (v) => { payload = v; });
    ee.once('done', () => { clearTimeout(timer); res(payload); });
  });
  assert.equal(got, 42);
});

2) Poll with a Deadline (a.k.a. Eventually)

Use a retry loop with a deadline, not a fixed sleep.

Go

// eventually polls f until it reports true or d has elapsed, and fails the
// test with t.Fatalf if the condition never held.
//
// f is always evaluated at least once, even when d is zero or negative, and
// once more after the deadline has passed, so a condition that became true
// during the final sleep is not reported as a failure.
func eventually(t *testing.T, d time.Duration, f func() bool) {
    t.Helper() // attribute the failure to the calling test, not this helper

    const step = 10 * time.Millisecond
    deadline := time.Now().Add(d)
    for {
        if f() {
            return
        }
        remaining := time.Until(deadline)
        if remaining <= 0 {
            break
        }
        // Never sleep past the deadline; the last check lands right on it.
        if remaining > step {
            remaining = step
        }
        time.Sleep(remaining)
    }
    t.Fatalf("condition not met within %s", d)
}

// TestCache_WritesEventuallyVisible issues a write through AsyncWrite and then
// polls Get, for at most 500ms, until the written value can be read back.
func TestCache_WritesEventuallyVisible(t *testing.T) {
    cache := NewCache()
    cache.AsyncWrite("k", "v")

    visible := func() bool {
        return cache.Get("k") == "v"
    }
    eventually(t, 500*time.Millisecond, visible)
}

Node/JS

async function eventually(ms, fn, step = 10) {
  const start = Date.now();
  while (Date.now() - start < ms) {
    if (await fn()) return;
    await new Promise(r => setTimeout(r, step));
  }
  throw new Error('condition not met');
}

// Issues the write, then polls for up to 500 ms until the value reads back.
test('cache write visible', async () => {
  const cache = new Cache();
  cache.asyncWrite('k', 'v');

  const isVisible = () => cache.get('k') === 'v';
  await eventually(500, isVisible);
});

3) Fake the Clock

Advance time instantly to test time-dependent logic.

Go

// Clock abstracts the source of the current time so that tests can substitute
// an implementation they control.
type Clock interface {
    Now() time.Time
}

// fakeClock is a manually driven Clock: it reports whatever instant is stored
// in t and changes only when Advance is called.
type fakeClock struct {
    t time.Time
}

// Now returns the instant the clock is currently set to.
func (c *fakeClock) Now() time.Time {
    return c.t
}

// Advance shifts the clock by d.
func (c *fakeClock) Advance(d time.Duration) {
    c.t = c.t.Add(d)
}

func TestToken_Expires(t *testing.T) {
    fc := &fakeClock{t: time.Unix(0,0)}
    tok := NewToken(clock=fc)
    if tok.Expired() { t.Fatal("unexpected expired") }
    fc.Advance(24 * time.Hour)
    if !tok.Expired() { t.Fatal("expected expired") }
}

Node/JS (fake timers)

import { test, mock } from 'node:test';
import assert from 'node:assert';

// Drives expiry with mocked timers instead of waiting for real time to pass.
// NOTE(review): presumably Token reads the (mocked) Date — confirm.
test('expires after 24h', (t) => {
  mock.timers.enable({ apis: ['setTimeout', 'Date'] });
  try {
    const tok = new Token();
    assert.equal(tok.expired(), false);
    mock.timers.tick(24 * 60 * 60 * 1000);
    assert.equal(tok.expired(), true);
  } finally {
    // Always restore real timers, even when an assertion above throws;
    // otherwise the mocked clock leaks into every test that runs afterwards.
    mock.timers.reset();
  }
});

4) Latches & WaitGroups

Wait for multiple goroutines/tasks to finish, instead of guessing how long they take.

Go

// Each goroutine reports completion through wg, so Wait returns only after
// both producer and consumer have returned.
var wg sync.WaitGroup
wg.Add(2)
go func() {
    defer wg.Done()
    producer()
}()
go func() {
    defer wg.Done()
    consumer()
}()
wg.Wait() // no sleeps

Node/JS

// taskA() and taskB() are both called before the await, so (assuming each
// returns a promise) they are in flight together; Promise.all fulfills once
// all of them have fulfilled and rejects as soon as any one rejects.
await Promise.all([ taskA(), taskB() ]); // deterministic join

Anti-Patterns to Delete or Refactor

  • sleep(2000) "just to be safe."
  • Global flags that tests poll without isolation.
  • Assertions that depend on execution order when the system is concurrent by design.

Debugging Flakes Checklist

  • Seed your PRNG; inject clocks/ID gens.
  • Replace sleeps with signals or deadlines.
  • Capture logs with timestamps; add trace IDs.
  • Run tests with the race detector (`go test -race` in Go) or your runtime's equivalent concurrency diagnostics.

Golden Rules

👉 Wait for events, not time.
👉 Advance fake time, don't burn real time.
👉If a test needs more than a tiny retry window, redesign the API to expose a signal.