fix(websocket): order register/unregister via single ops channel

With two separate channels in one select, Go's pseudo-random choice among
ready cases could process an unregister before its matching register from
the same goroutine: the unregister was then a no-op and the later register
left a stale entry in the client map. Replace them with a single ordered
ops channel so program order is preserved end-to-end.
This commit is contained in:
MHSanaei
2026-05-19 12:34:53 +02:00
parent 85e2ded0e1
commit 6000bc7134

View File

@@ -29,11 +29,23 @@ const (
enqueueTimeout = 100 * time.Millisecond enqueueTimeout = 100 * time.Millisecond
clientSendQueue = 512 // ~50s of buffering for a momentarily slow browser. clientSendQueue = 512 // ~50s of buffering for a momentarily slow browser.
hubBroadcastQueue = 2048 // Headroom for cron-storm + admin-mutation bursts. hubBroadcastQueue = 2048 // Headroom for cron-storm + admin-mutation bursts.
hubControlQueue = 64 // Backlog for register/unregister bursts (page reloads, disconnect storms). hubOpsQueue = 128 // Backlog for register+unregister bursts (page reloads, disconnect storms).
minBroadcastInterval = 250 * time.Millisecond minBroadcastInterval = 250 * time.Millisecond
hubRestartAttempts = 3 hubRestartAttempts = 3
) )
// clientOpKind identifies which registry operation a clientOp requests.
type clientOpKind int
const (
// opRegister asks the hub loop to add the client to its client set.
opRegister clientOpKind = iota
// opUnregister asks the hub loop to remove the client (via removeClient).
opUnregister
)
// clientOp is a single entry on the Hub's ops channel. Register and
// Unregister both enqueue one, so the two kinds of request share one queue
// and are received in the order they were sent.
type clientOp struct {
kind clientOpKind
c *Client // target client; the hub loop skips ops whose c is nil
}
// NewClient builds a Client ready for hub registration. // NewClient builds a Client ready for hub registration.
func NewClient(id string) *Client { func NewClient(id string) *Client {
return &Client{ return &Client{
@@ -58,13 +70,12 @@ type Client struct {
// Hub fan-outs messages to all connected clients. // Hub fan-outs messages to all connected clients.
type Hub struct { type Hub struct {
clients map[*Client]struct{} clients map[*Client]struct{}
broadcast chan []byte broadcast chan []byte
register chan *Client ops chan clientOp
unregister chan *Client mu sync.RWMutex
mu sync.RWMutex ctx context.Context
ctx context.Context cancel context.CancelFunc
cancel context.CancelFunc
throttleMu sync.Mutex throttleMu sync.Mutex
lastBroadcast map[MessageType]time.Time lastBroadcast map[MessageType]time.Time
@@ -76,8 +87,7 @@ func NewHub() *Hub {
return &Hub{ return &Hub{
clients: make(map[*Client]struct{}), clients: make(map[*Client]struct{}),
broadcast: make(chan []byte, hubBroadcastQueue), broadcast: make(chan []byte, hubBroadcastQueue),
register: make(chan *Client, hubControlQueue), ops: make(chan clientOp, hubOpsQueue),
unregister: make(chan *Client, hubControlQueue),
ctx: ctx, ctx: ctx,
cancel: cancel, cancel: cancel,
lastBroadcast: make(map[MessageType]time.Time), lastBroadcast: make(map[MessageType]time.Time),
@@ -145,21 +155,20 @@ func (h *Hub) runOnce() (stopped bool) {
h.shutdown() h.shutdown()
return true return true
case c := <-h.register: case op := <-h.ops:
if c == nil { if op.c == nil {
continue continue
} }
h.mu.Lock() switch op.kind {
h.clients[c] = struct{}{} case opRegister:
n := len(h.clients) h.mu.Lock()
h.mu.Unlock() h.clients[op.c] = struct{}{}
logger.Debugf("WebSocket client connected: %s (total: %d)", c.ID, n) n := len(h.clients)
h.mu.Unlock()
case c := <-h.unregister: logger.Debugf("WebSocket client connected: %s (total: %d)", op.c.ID, n)
if c == nil { case opUnregister:
continue h.removeClient(op.c)
} }
h.removeClient(c)
case msg := <-h.broadcast: case msg := <-h.broadcast:
h.fanout(msg) h.fanout(msg)
@@ -321,29 +330,29 @@ func (h *Hub) Register(c *Client) {
return return
} }
select { select {
case h.register <- c: case h.ops <- clientOp{kind: opRegister, c: c}:
case <-h.ctx.Done(): case <-h.ctx.Done():
} }
} }
// Unregister removes a client from the hub. Fast path queues for the hub // Unregister removes a client from the hub. Sends through the same ordered
// goroutine; if the channel is saturated (disconnect storm) we fall back // ops channel as Register so a register-then-unregister sequence from one
// to a direct removal under the write lock so dead clients aren't left in // goroutine is processed in program order — otherwise an unregister could
// the registry waiting for their Send buffer to fill (minutes of wasted // land in the map before its register and silently no-op, leaking the entry.
// fanout work at low broadcast rates).
// //
// Direct removal is safe from any caller: external goroutines (read/write // On a saturated ops channel (disconnect storm) we fall back to a bounded
// pumps) hold no hub locks, and the hub goroutine itself never holds h.mu // timeout drop rather than direct removal: a direct delete on a not-yet-
// when it calls Unregister — fanout releases its RLock before per-client // registered client is precisely the ordering bug we fix here. Stragglers
// sends, so we can't self-deadlock here. // get evicted by fanout when their Send buffer fills.
func (h *Hub) Unregister(c *Client) { func (h *Hub) Unregister(c *Client) {
if h == nil || c == nil { if h == nil || c == nil {
return return
} }
select { select {
case h.unregister <- c: case h.ops <- clientOp{kind: opUnregister, c: c}:
default: case <-time.After(enqueueTimeout):
h.removeClient(c) logger.Warningf("WebSocket ops channel full, dropping unregister for %s", c.ID)
case <-h.ctx.Done():
} }
} }