Files
olcrtc/internal/supervisor/supervisor.go
2026-05-16 04:13:07 +03:00

270 lines
6.7 KiB
Go

// Package supervisor runs ordered session profiles with failover.
package supervisor
import (
"context"
"errors"
"fmt"
"time"
"github.com/openlibrecommunity/olcrtc/internal/app/session"
)
// DefaultRetryDelay is used between profile attempts when Config.RetryDelay is unset.
const DefaultRetryDelay = 2 * time.Second
// DefaultHistoryLimit bounds emitted status history when Config.HistoryLimit is unset.
const DefaultHistoryLimit = 20
const (
// EventProfileStart marks a profile attempt starting.
EventProfileStart = "profile_start"
// EventProfileEnd marks a profile attempt ending.
EventProfileEnd = "profile_end"
)
var (
// ErrNoProfiles is returned when the supervisor is started without profiles.
ErrNoProfiles = errors.New("supervisor: no profiles configured")
// ErrMaxCyclesExceeded is returned after MaxCycles complete profile-list passes.
ErrMaxCyclesExceeded = errors.New("supervisor: max failover cycles exceeded")
errProfileCleanEnd = errors.New("profile ended")
)
// Profile is one runnable session configuration in an ordered failover list.
type Profile struct {
Name string
Config session.Config
}
// ProfileStatus summarizes one profile's failover history.
type ProfileStatus struct {
Name string
Starts int
Failures int
CleanEnds int
LastStarted time.Time
LastEnded time.Time
LastError string
}
// Event is one bounded failover history entry.
type Event struct {
Time time.Time
Type string
Profile string
Cycle int
Error string
}
// Status is a point-in-time view of the supervisor.
type Status struct {
Cycle int
ActiveProfile string
ActiveProfileIndex int
Profiles []ProfileStatus
History []Event
LastError string
}
// Runner starts one session profile and blocks until it ends or fails.
type Runner func(ctx context.Context, cfg session.Config) error
// Config controls ordered failover behavior.
type Config struct {
Profiles []Profile
RetryDelay time.Duration
MaxCycles int
OnProfileStart func(profile Profile, cycle int)
OnProfileEnd func(profile Profile, cycle int, err error)
OnStatus func(status Status)
HistoryLimit int
}
// Run starts profiles in order. If a profile exits while ctx is still active,
// the supervisor waits RetryDelay and advances to the next profile.
func Run(ctx context.Context, cfg Config, run Runner) error {
if len(cfg.Profiles) == 0 {
return ErrNoProfiles
}
if cfg.RetryDelay == 0 {
cfg.RetryDelay = DefaultRetryDelay
}
state := newStatusTracker(cfg.Profiles, cfg.HistoryLimit, cfg.OnStatus)
var lastErr error
for cycle := 1; ; cycle++ {
if err := runCycle(ctx, cfg, run, state, cycle, &lastErr); err != nil {
return err
}
if ctx.Err() != nil {
return nil //nolint:nilerr // context cancellation is normal supervisor shutdown
}
}
}
func runCycle(
ctx context.Context,
cfg Config,
run Runner,
state *statusTracker,
cycle int,
lastErr *error,
) error {
for i, profile := range cfg.Profiles {
if err := runProfile(ctx, cfg, run, state, cycle, i, profile, lastErr); err != nil {
return err
}
}
return nil
}
func runProfile(
ctx context.Context,
cfg Config,
run Runner,
state *statusTracker,
cycle int,
profileIndex int,
profile Profile,
lastErr *error,
) error {
if ctx.Err() != nil {
return nil //nolint:nilerr // context cancellation is normal supervisor shutdown
}
state.start(profileIndex, cycle)
if cfg.OnProfileStart != nil {
cfg.OnProfileStart(profile, cycle)
}
err := run(ctx, profile.Config)
if ctx.Err() != nil {
return nil //nolint:nilerr // context cancellation is normal supervisor shutdown
}
*lastErr = profileResultError(profile.Name, err)
state.end(profileIndex, cycle, err)
if cfg.OnProfileEnd != nil {
cfg.OnProfileEnd(profile, cycle, err)
}
if cfg.MaxCycles > 0 && cycle >= cfg.MaxCycles && profileIndex == len(cfg.Profiles)-1 {
return fmt.Errorf("%w after %d cycle(s): %w", ErrMaxCyclesExceeded, cycle, *lastErr)
}
if err := waitRetryDelay(ctx, cfg.RetryDelay); err != nil {
return nil //nolint:nilerr // context cancellation during retry delay is normal shutdown
}
return nil
}
func profileResultError(name string, err error) error {
if err != nil {
return fmt.Errorf("profile %q: %w", name, err)
}
return fmt.Errorf("profile %q: %w", name, errProfileCleanEnd)
}
type statusTracker struct {
status Status
notify func(Status)
historyLimit int
}
func newStatusTracker(profiles []Profile, historyLimit int, notify func(Status)) *statusTracker {
if historyLimit == 0 {
historyLimit = DefaultHistoryLimit
}
statusProfiles := make([]ProfileStatus, 0, len(profiles))
for _, profile := range profiles {
statusProfiles = append(statusProfiles, ProfileStatus{Name: profile.Name})
}
return &statusTracker{
status: Status{
ActiveProfileIndex: -1,
Profiles: statusProfiles,
},
notify: notify,
historyLimit: historyLimit,
}
}
func (t *statusTracker) start(profileIndex, cycle int) {
now := time.Now()
profile := &t.status.Profiles[profileIndex]
profile.Starts++
profile.LastStarted = now
t.status.Cycle = cycle
t.status.ActiveProfile = profile.Name
t.status.ActiveProfileIndex = profileIndex
t.appendHistory(Event{
Time: now,
Type: EventProfileStart,
Profile: profile.Name,
Cycle: cycle,
})
t.emit()
}
func (t *statusTracker) end(profileIndex, cycle int, err error) {
now := time.Now()
profile := &t.status.Profiles[profileIndex]
profile.LastEnded = now
event := Event{
Time: now,
Type: EventProfileEnd,
Profile: profile.Name,
Cycle: cycle,
}
if err != nil {
profile.Failures++
profile.LastError = err.Error()
t.status.LastError = fmt.Sprintf("profile %q: %v", profile.Name, err)
event.Error = err.Error()
} else {
profile.CleanEnds++
profile.LastError = ""
t.status.LastError = fmt.Sprintf("profile %q ended", profile.Name)
}
t.status.ActiveProfile = ""
t.status.ActiveProfileIndex = -1
t.appendHistory(event)
t.emit()
}
func (t *statusTracker) appendHistory(event Event) {
if t.historyLimit < 0 {
return
}
t.status.History = append(t.status.History, event)
if len(t.status.History) > t.historyLimit {
t.status.History = t.status.History[len(t.status.History)-t.historyLimit:]
}
}
func (t *statusTracker) emit() {
if t.notify == nil {
return
}
t.notify(cloneStatus(t.status))
}
func cloneStatus(status Status) Status {
status.Profiles = append([]ProfileStatus(nil), status.Profiles...)
status.History = append([]Event(nil), status.History...)
return status
}
func waitRetryDelay(ctx context.Context, delay time.Duration) error {
if delay <= 0 {
return nil
}
timer := time.NewTimer(delay)
defer timer.Stop()
select {
case <-ctx.Done():
return fmt.Errorf("retry delay canceled: %w", ctx.Err())
case <-timer.C:
return nil
}
}