diff --git a/go.mod b/go.mod index 467ff07..a4bc3fe 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/xtaci/kcp-go/v5 v5.6.72 github.com/xtaci/smux v1.5.57 github.com/zarazaex69/gr v0.0.0-20260430043628-45b595f4fef0 - github.com/zarazaex69/j v0.0.0-20260515183030-8e13c23cdfdc + github.com/zarazaex69/j v0.0.0-20260515220619-5c2e698ab751 golang.org/x/crypto v0.50.0 golang.org/x/mobile v0.0.0-20260410095206-2cfb76559b7b google.golang.org/genproto v0.0.0-20260209200024-4cfbd4190f57 diff --git a/go.sum b/go.sum index 19ae7b8..2a51588 100644 --- a/go.sum +++ b/go.sum @@ -235,8 +235,10 @@ github.com/xtaci/smux v1.5.57/go.mod h1:IGQ9QYrBphmb/4aTnLEcJby0TNr3NV+OslIOMrX8 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/zarazaex69/gr v0.0.0-20260430043628-45b595f4fef0 h1:dMjHX/YPV3ZD/KJKFjQdlMBwj2/rZIuOVKOvGv26m9k= github.com/zarazaex69/gr v0.0.0-20260430043628-45b595f4fef0/go.mod h1:7vALI2tjaLTOGiDKV7V2JkVU9bA1YADBDQA6uvpp1ac= -github.com/zarazaex69/j v0.0.0-20260515183030-8e13c23cdfdc h1:Nz6NuOZMNSMOujclXHE4a4/6Rb5Ivl1vMdmlXEV5GCg= -github.com/zarazaex69/j v0.0.0-20260515183030-8e13c23cdfdc/go.mod h1:7/ypJTenOIPx23fpo5uF7l4u+rxZqg9cFbTL/N77Ktc= +github.com/zarazaex69/j v0.0.0-20260515213105-2ceb87d63925 h1:rGy4bUvnzt6PZGgwHqyHG4oPsv18FOzD9X4TrRpSxD0= +github.com/zarazaex69/j v0.0.0-20260515213105-2ceb87d63925/go.mod h1:7/ypJTenOIPx23fpo5uF7l4u+rxZqg9cFbTL/N77Ktc= +github.com/zarazaex69/j v0.0.0-20260515220619-5c2e698ab751 h1:waM/fqf4Wkyj5IksKdvDR0R9rwvRQcWxbXHwADpdOVI= +github.com/zarazaex69/j v0.0.0-20260515220619-5c2e698ab751/go.mod h1:7/ypJTenOIPx23fpo5uF7l4u+rxZqg9cFbTL/N77Ktc= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.1.0 h1:s7DLGDK45Dyfg7++yxI0khrfwq9661w9EN78eP/UZVs= diff --git a/internal/engine/jitsi/jitsi.go b/internal/engine/jitsi/jitsi.go index 166c7f4..2966584 100644 --- a/internal/engine/jitsi/jitsi.go +++ b/internal/engine/jitsi/jitsi.go @@ -613,61 +613,39 @@ func decodeRaw(m j.BridgeMessage) []byte { // Close terminates the session and releases resources. // -// Shutdown is performed in the order a Jitsi web client uses: +// Shutdown follows the lib-jitsi-meet JitsiConference.leave() contract: // // 1. Mark the session closed so send/recv loops drop new work. -// 2. If a pion PeerConnection was negotiated, send Jingle -// session-terminate to Jicofo so the conference state is updated and -// the JVB bridge slot is freed promptly. Without this, Jicofo only -// notices the participant is gone after the MUC presence-unavailable -// stanza, and JVB only reclaims resources after a longer idle timeout. -// 3. Close the pion PeerConnection (stops media, sends DTLS bye). -// 4. Close the underlying j.Session, which closes the colibri-ws bridge, -// sends MUC presence-unavailable, and tears down the XMPP transport. -// 5. Cancel the supervisor context and wait for goroutines. +// 2. Close the pion PeerConnection (stops media, sends DTLS bye). This +// mirrors jvbJingleSession.close() in lib-jitsi-meet — note that +// graceful leave there does NOT send Jingle session-terminate; Jicofo +// learns of the departure from the MUC presence-unavailable stanza +// and only then frees the JVB bridge slot. +// 3. Close the underlying j.Session, which closes the colibri-ws bridge, +// performs the MUC presence-unavailable handshake (LeaveMUCWait +// waits for Prosody to echo our own unavailable presence — the +// XMPP-level equivalent of XMPPEvents.MUC_LEFT — with a 5s cap), +// and only then tears down the websocket. +// 4. Cancel the supervisor context and wait for goroutines. +// +// Why no session-terminate: empirically, when the application layer (e.g. +// seichannel) wedges and the test fails before clean shutdown, Jicofo +// stops replying to our session-terminate IQ. TerminateWait then ate its +// 3s budget and we still left ghost participants behind. lib-jitsi-meet +// avoids this entirely by relying on MUC presence as the single source of +// truth for departure — Prosody's MUC layer is far more reliable than +// Jicofo's IQ handler under load. func (s *Session) Close() error { if !s.closed.CompareAndSwap(false, true) { return nil } - // Tell Jicofo we're leaving BEFORE closing any transport. The order - // matters: a half-torn-down websocket can drop the session-terminate / - // presence-unavailable stanzas, leaving the participant in the MUC - // roster until idle timeout. Subsequent tests then see ghost endpoints - // in the bridge channel and receive garbage during handshake. jSess := s.jSess.Load() - if jSess != nil { - if err := s.terminateJingleSession(jSess); err != nil { - logger.Infof("jitsi: session-terminate failed: %v", err) - } - // Send MUC presence-unavailable and give Prosody time to fan it - // out to Jicofo, which in turn must tell JVB to drop our endpoint - // before another participant under the same room can claim a - // clean slot. - // - // Why so long: the j library fires IQ stanzas without waiting - // for from Prosody, so "wrote bytes to the - // websocket" is not "Jicofo processed the request". The chain - // session-terminate → Prosody → Jicofo → JVB takes a real RTT - // plus Jicofo's own handling. With <500ms here, restarting a - // session in the same room immediately collides with a still- - // alive ghost endpoint and the new conference inherits stale - // SSRCs / source-add stanzas, which is exactly the failure mode - // we kept hitting in back-to-back e2e runs. - // - // 2s is what jitsi-meet uses internally between leave and - // resource cleanup (lib-jitsi-meet's leaveRoomEvent grace) and - // matches what the bridge expects for a clean handoff. - if conn := jSess.LowLevel(); conn != nil { - if err := conn.LeaveMUC(s.room); err != nil { - logger.Infof("jitsi: LeaveMUC failed: %v", err) - } else { - logger.Infof("jitsi: LeaveMUC sent") - } - time.Sleep(2 * time.Second) - } - } + // Close PC first so DTLS goes out and the bridge sees media stop; + // this ordering matches lib-jitsi-meet's leave() and lets the + // follow-up MUC presence unavailable hit Jicofo with PC already + // torn down (no session-terminate dance is involved). s.pcMu.Lock() pc := s.pc s.pc = nil @@ -676,6 +654,10 @@ func (s *Session) Close() error { _ = pc.Close() } + // jSess.Close() performs the MUC unavailable handshake and only then + // tears down the websocket. It logs the handshake outcome itself so + // we can distinguish "Prosody confirmed leave" from "5s timeout, + // fell back to fire-and-forget" in failure-mode investigations. if jSess != nil { _ = jSess.Close() } @@ -699,18 +681,29 @@ func (s *Session) Close() error { return nil } -// terminateJingleSession sends a Jingle session-terminate stanza to Jicofo -// so the conference state is updated immediately. Sent even when no pion -// PeerConnection was negotiated: Jicofo allocates the JVB bridge slot the -// moment it dispatches session-initiate, regardless of whether the -// participant ever sent session-accept, and an explicit session-terminate -// frees that slot promptly. +// terminateJingleSession USED to send a Jingle session-terminate stanza to +// Jicofo so the conference state is updated immediately. We removed the +// call from Close(): empirically Jicofo stops replying to that IQ when +// the participant's media path has wedged, and waiting on it cost 3s of +// shutdown budget while still leaving ghost participants behind. The +// real-world fix mirrors lib-jitsi-meet's leave path which relies purely +// on MUC presence unavailable to signal departure. +// +// We keep this helper around (calling the j-library's TerminateWait) for +// future flows where an explicit terminate is genuinely needed (e.g. ICE +// restart) — matching what lib-jitsi-meet's _stopJvbSession does for +// those specific cases. func (s *Session) terminateJingleSession(jSess *j.Session) error { neg := jSess.Negotiator() if neg == nil { return nil } - return neg.Terminate("success") + // 3s is enough for a healthy bridge round-trip; on a wedged Jicofo we + // still cap the wait. Synchronous result handling mirrors what + // lib-jitsi-meet's JingleSessionPC.terminate does via sendIQ() with a + // callback, and is what stops the next session in the same MUC from + // inheriting our endpoint's ghost. + return neg.TerminateWait("success", 3*time.Second) } // SetReconnectCallback registers a callback for reconnection events. diff --git a/internal/engine/livekit/livekit.go b/internal/engine/livekit/livekit.go index fcc9749..24c62bd 100644 --- a/internal/engine/livekit/livekit.go +++ b/internal/engine/livekit/livekit.go @@ -14,6 +14,7 @@ import ( "log" "sync" "sync/atomic" + "time" protoLogger "github.com/livekit/protocol/logger" lksdk "github.com/livekit/server-sdk-go/v2"