package e2e import ( "bufio" "bytes" "context" "errors" "flag" "fmt" "io" "net" "runtime" "slices" "testing" "time" enginebuiltin "github.com/openlibrecommunity/olcrtc/internal/engine/builtin" ) var ( errStressNoRoundtrips = errors.New("no successful roundtrips within duration") errStressPayloadMatch = errors.New("payload mismatch") errStressNoBulkProgress = errors.New("bulk pump made zero progress") ) var ( realStress = flag.Bool( //nolint:gochecknoglobals // package-level state intentional "olcrtc.stress", false, "run real provider stress matrix (bulk transfer + sustained echo) - requires -olcrtc.real-e2e", ) realStressBulkDuration = flag.Duration( //nolint:gochecknoglobals // package-level state intentional "olcrtc.stress-bulk-duration", 60*time.Second, "per-case duration for the bulk pattern-pump phase (set 0 to skip). "+ "Throughput differs by ~3 orders of magnitude across transports "+ "(datachannel: MiB/s; videochannel: KB/s), so we measure how much "+ "flows in a fixed time rather than fixing the byte budget.", ) realStressDuration = flag.Duration( //nolint:gochecknoglobals // package-level state intentional "olcrtc.stress-duration", 30*time.Second, "per-case duration for the sustained echo phase (set 0 to skip)", ) realStressEchoSize = flag.Int( //nolint:gochecknoglobals // package-level state intentional "olcrtc.stress-echo-size", 1024, "single-roundtrip payload size during the sustained echo phase", ) realStressCaseTimeout = flag.Duration( //nolint:gochecknoglobals // package-level state intentional "olcrtc.stress-case-timeout", 5*time.Minute, "hard timeout per stress carrier×transport case (covers connect + bulk + echo)", ) realStressBulkChunkSize = flag.Int( //nolint:gochecknoglobals // package-level state intentional "olcrtc.stress-bulk-chunk", 4096, "bulk request-response chunk size in bytes", ) ) // TestRealProviderTransportStress exercises every real carrier×transport // combination under load. For each pair, two phases run sequentially over // a single SOCKS connection: // // 1. Bulk phase: stream a deterministic byte pattern through the tunnel // for -olcrtc.stress-bulk-duration and verify it echoes back byte-for- // byte. Reports observed throughput. Different transports differ by // orders of magnitude (qr-encoded videochannel vs SCTP datachannel), // so we measure rather than assert a fixed budget. // 2. Echo phase: send -olcrtc.stress-echo-size payloads as fast as the // loop will go for -olcrtc.stress-duration, recording per-RT latency // and computing p50/p95/p99. // // Around both phases we snapshot runtime.NumGoroutine to surface obvious // goroutine leaks introduced by reconnect / bytestream / epoch regressions. // // Gated by -olcrtc.stress so it never runs on every push; intended for the // nightly soak job in CI and for local stress profiling. // //nolint:cyclop // matrix of carrier×transport expectations is naturally branchy func TestRealProviderTransportStress(t *testing.T) { if !*realE2E { t.Skip("real provider e2e disabled; pass -olcrtc.real-e2e to enable") } if !*realStress { t.Skip("stress disabled; pass -olcrtc.stress to enable") } carriers := splitTestList(*realE2ECarriers) transports := splitTestList(*realE2ETransports) if len(carriers) == 0 { t.Fatal("no real e2e carriers selected") } if len(transports) == 0 { t.Fatal("no real e2e transports selected") } echoAddr := startEchoServer(t) for _, carrierName := range carriers { t.Run(carrierName, func(t *testing.T) { roomCtx, cancelRoom := context.WithTimeout(context.Background(), *realStressCaseTimeout) defer cancelRoom() roomURL := requireRealRoom(roomCtx, t, carrierName) var authFailed bool for _, transportName := range transports { t.Run(transportName, func(t *testing.T) { if authFailed { t.Skip("skipping: carrier auth failed on previous transport") } expectation := realE2ECaseExpectation(carrierName, transportName) if expectation == realE2EExpectFail { t.Skip("skipping: combo not expected to pass even at baseline") } err := runRealE2EStressCase(t, carrierName, transportName, roomURL, echoAddr) if err != nil && errors.Is(err, enginebuiltin.ErrAuthFailed) { authFailed = true t.Skipf("skip %s stress: auth failed: %v", carrierName, err) } switch { case err == nil: t.Logf("STRESS OK %s/%s", carrierName, transportName) case expectation == realE2EExpectUnstable: logUnstableOutcome(t, "STRESS UNSTABLE", carrierName, transportName, err) default: t.Fatalf("STRESS FAIL %s/%s: %v", carrierName, transportName, err) } }) } }) } } //nolint:cyclop // two phases plus tunnel/connection setup naturally branch func runRealE2EStressCase(t *testing.T, carrierName, transportName, roomURL, echoAddr string) (err error) { t.Helper() ctx, cancel := context.WithTimeout(context.Background(), *realStressCaseTimeout) defer cancel() goroutinesBefore := runtime.NumGoroutine() rt, err := startRealTunnel(ctx, t, carrierName, transportName, roomURL, testClientDeviceID, testClientDeviceID) if err != nil { return err } defer func() { if stopErr := rt.stopErr(); err == nil && stopErr != nil { err = stopErr } }() conn, err := connectViaSOCKSWithin(rt.socksAddr, echoAddr, *realStressCaseTimeout) if err != nil { return err } defer func() { _ = conn.Close() }() if d := *realStressBulkDuration; d > 0 { written, dur, err := streamPatternForDuration(conn, d, *realStressBulkChunkSize) if err != nil { return fmt.Errorf("bulk pump: %w", err) } throughput := float64(written) / dur.Seconds() / (1 << 20) t.Logf("bulk %s/%s: %d bytes in %s (%.3f MiB/s)", carrierName, transportName, written, dur, throughput) if written == 0 { return errStressNoBulkProgress } } if d := *realStressDuration; d > 0 { stats, err := sustainedEcho(conn, *realStressEchoSize, d) if err != nil { return fmt.Errorf("sustained echo: %w", err) } t.Logf("echo %s/%s: %d rt in %s, p50=%s p95=%s p99=%s max=%s lost=%d", carrierName, transportName, stats.count, d, stats.p50, stats.p95, stats.p99, stats.maxLatency, stats.lost) if stats.count == 0 { return fmt.Errorf("%w: %s", errStressNoRoundtrips, d) } } goroutinesAfter := runtime.NumGoroutine() // Allow some slack - pion/quic spawn helpers that take time to wind down // after Close, but a real leak shows up as tens of extra goroutines. const goroutineLeakSlack = 30 if goroutinesAfter > goroutinesBefore+goroutineLeakSlack { t.Logf("WARNING: goroutines grew %d -> %d during %s/%s", goroutinesBefore, goroutinesAfter, carrierName, transportName) } return nil } // streamPatternForDuration pumps a deterministic byte pattern through conn // for at most `duration` using a synchronous request-response loop: write a // chunk, wait until the same chunk echoes back and verify, then write the // next one. Returns total bytes successfully echoed and elapsed time. // // Why request-response rather than concurrent write+read streams: // transport throughputs differ by ~3 orders of magnitude (datachannel does // MiB/s; videochannel/seichannel ~25 KB/s through 256-byte qr-encoded // frames at 25 FPS). An asynchronous writer outruns a slow transport, // fills muxconn / SOCKS / RTP-track buffers, and the deadlocked pipe // eventually trips a TCP-write deadline - which is not a real bug, just // the natural consequence of pumping into a slow pipe with no flow // control. Request-response naturally rate-limits to the transport's // actual round-trip throughput, which is what we want to measure. func streamPatternForDuration(conn net.Conn, duration time.Duration, chunkSize int) (int64, time.Duration, error) { if chunkSize <= 0 { chunkSize = 4096 } // Per-chunk roundtrip deadline. Slow transports (videochannel) can // take seconds+ per chunk in practice; 15s gives ample margin // without making genuine stalls hang forever. const chunkTimeout = 15 * time.Second start := time.Now() deadline := start.Add(duration) buf := make([]byte, chunkSize) echoed := make([]byte, chunkSize) want := make([]byte, chunkSize) reader := bufio.NewReader(conn) var total int64 for time.Now().Before(deadline) { fillPattern(buf, total) if err := conn.SetWriteDeadline(time.Now().Add(chunkTimeout)); err != nil { return total, time.Since(start), fmt.Errorf("set write deadline at %d: %w", total, err) } if _, err := conn.Write(buf); err != nil { return total, time.Since(start), fmt.Errorf("write at %d: %w", total, err) } if err := conn.SetReadDeadline(time.Now().Add(chunkTimeout)); err != nil { return total, time.Since(start), fmt.Errorf("set read deadline at %d: %w", total, err) } if _, err := io.ReadFull(reader, echoed); err != nil { return total, time.Since(start), fmt.Errorf("read at %d: %w", total, err) } fillPattern(want, total) if !bytes.Equal(echoed, want) { return total, time.Since(start), fmt.Errorf("%w %d", errPayloadMismatchOffset, total) } total += int64(chunkSize) } return total, time.Since(start), nil } type echoStats struct { count int lost int p50, p95, p99 time.Duration maxLatency time.Duration } // sustainedEcho writes payloads of size `payloadSize` and waits for them to // echo back, recording per-roundtrip latency. Runs until duration elapses // or the underlying connection fails. Each write/read uses a deadline so a // stuck transport surfaces as a finite-time test failure rather than a hang. // //nolint:cyclop // per-rt deadlines + error wrapping naturally branch many ways func sustainedEcho(conn net.Conn, payloadSize int, duration time.Duration) (echoStats, error) { if payloadSize < 4 { payloadSize = 4 } deadline := time.Now().Add(duration) payload := make([]byte, payloadSize) for i := range payload { payload[i] = byte('a' + (i % 26)) } // Mark the payload terminator so we can ReadFull a fixed length back. payload[payloadSize-1] = '\n' reader := bufio.NewReader(conn) var stats echoStats latencies := make([]time.Duration, 0, 1024) buf := make([]byte, payloadSize) for time.Now().Before(deadline) { if err := conn.SetWriteDeadline(time.Now().Add(5 * time.Second)); err != nil { return stats, fmt.Errorf("set write deadline: %w", err) } start := time.Now() if _, err := conn.Write(payload); err != nil { stats.lost++ return stats, fmt.Errorf("write at rt #%d: %w", stats.count, err) } if err := conn.SetReadDeadline(time.Now().Add(5 * time.Second)); err != nil { return stats, fmt.Errorf("set read deadline: %w", err) } if _, err := io.ReadFull(reader, buf); err != nil { stats.lost++ return stats, fmt.Errorf("read at rt #%d: %w", stats.count, err) } lat := time.Since(start) if !bytes.Equal(buf, payload) { return stats, fmt.Errorf("%w at rt #%d", errStressPayloadMatch, stats.count) } latencies = append(latencies, lat) if lat > stats.maxLatency { stats.maxLatency = lat } stats.count++ } if len(latencies) > 0 { slices.Sort(latencies) stats.p50 = latencies[len(latencies)*50/100] stats.p95 = latencies[min(len(latencies)*95/100, len(latencies)-1)] stats.p99 = latencies[min(len(latencies)*99/100, len(latencies)-1)] } return stats, nil }