Fuehre Offline-Schwelle fuer Server-Connectivity ein

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-03-22 18:25:01 +01:00 · 2026-03-22 18:25:01 +01:00 · a69135c0b9
commit a69135c0b9
parent 2c780d3e60
4 changed files with 76 additions and 8 deletions
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -187,7 +187,7 @@ go run ./cmd/agent
 1. Backend: einheitliches Fehlerformat und Routing-Grundstruktur anlegen
 2. Backend: Konfigurations- und App-Lifecycle stabilisieren
 3. Agent und Backend: den HTTP-Statuspfad als Grundlage fuer Identitaet, Persistenz und spaetere Admin-Vorschau erweitern
-4. Agent: danach einen expliziten `offline`-Zustand und weitere Connectivity-Schwellenlogik aufsetzen
+4. Agent: danach MQTT-spezifische Reachability und feinere Connectivity-Schwellenlogik aufsetzen
 5. Danach die Netzwerk-, Sync- und Kommandopfade schrittweise produktionsnah ausbauen

 Ergaenzt seit dem ersten Geruest:
@@ -199,7 +199,7 @@ Ergaenzt seit dem ersten Geruest:
 - dateibasierte Agent-Konfiguration zusaetzlich zu Env-Overrides
 - strukturierte Agent-Logs mit internem Health-Snapshot und signalgesteuertem Shutdown
 - erster periodischer HTTP-Status-Reporter im Agent
-- Server-Connectivity-Zustand im Agent (`unknown`, `online`, `degraded`) auf Basis der Report-Ergebnisse
+- Server-Connectivity-Zustand im Agent (`unknown`, `online`, `degraded`, `offline`) auf Basis der Report-Ergebnisse
 - lokales Compose-Grundgeruest fuer PostgreSQL und Mosquitto

 ## Arbeitsweise
--- a/docs/PLAYER-AGENT-LIFECYCLE.md
+++ b/docs/PLAYER-AGENT-LIFECYCLE.md
@@ -47,10 +47,17 @@ Getrennt vom Lifecycle fuehrt der Agent fuer die Server-Erreichbarkeit aktuell d
 - `unknown` vor dem ersten erfolgreichen oder fehlgeschlagenen Status-Report
 - `online` nach einem erfolgreich bestaetigten HTTP-Status-Report
 - `degraded` nach einem fehlgeschlagenen HTTP-Status-Report
+- `offline` nach wiederholten fehlgeschlagenen HTTP-Status-Reports

 Damit bleibt der Lifecycle sauber von Netz- und Gegenstellenproblemen getrennt.
 Ein Report-Fehler stoppt den Agenten nicht, sondern veraendert nur den Connectivity-Zustand.

+Aktuell gilt fuer diese Schwellenlogik bewusst einfach:
+
+- erster Fehl-Report: `degraded`
+- ab dem dritten aufeinanderfolgenden Fehl-Report: `offline`
+- naechster erfolgreicher Report: Rueckkehr nach `online`
+
 ## Strukturierte Log-Ereignisse

 Der Agent emittiert in v1 mindestens diese Ereignisse:
@@ -88,6 +95,6 @@ Nicht Teil dieser Stufe:
 - Kommandos oder Sync-Status

 Die erste Backend-Reachability-Pruefung ist in dieser Stufe bereits ueber den HTTP-Status-Report abgebildet.
-Ein expliziter `offline`-Zustand, MQTT-Reachability und weitergehende Schwellenlogik folgen spaeter.
+MQTT-Reachability und weitergehende Schwellenlogik folgen spaeter.

 Diese Punkte folgen erst, wenn echte Netzwerk- und Sync-Funktionalitaet eingebaut wird.
--- a/player/agent/internal/app/app.go
+++ b/player/agent/internal/app/app.go
@@ -24,8 +24,11 @@ const (
 	ConnectivityUnknown  Connectivity = "unknown"
 	ConnectivityOnline   Connectivity = "online"
 	ConnectivityDegraded Connectivity = "degraded"
+	ConnectivityOffline  Connectivity = "offline"
 )

+const offlineFailureThreshold = 3
+
 type HealthSnapshot struct {
 	Status             Status
 	ServerConnectivity Connectivity
@@ -46,6 +49,7 @@ type App struct {
 	mu                        sync.RWMutex
 	status                    Status
 	serverConnectivity        Connectivity
+	consecutiveReportFailures int
 	startedAt                 time.Time
 	lastHeartbeatAt           time.Time
 }
@@ -181,13 +185,18 @@ func (a *App) reportStatus(ctx context.Context) {
 	})
 	if err != nil {
 		a.mu.Lock()
+		a.consecutiveReportFailures++
 		a.serverConnectivity = ConnectivityDegraded
+		if a.consecutiveReportFailures >= offlineFailureThreshold {
+			a.serverConnectivity = ConnectivityOffline
+		}
 		a.mu.Unlock()
 		a.logger.Printf("event=status_report_failed screen_id=%s error=%v", a.Config.ScreenID, err)
 		return
 	}

 	a.mu.Lock()
+	a.consecutiveReportFailures = 0
 	a.serverConnectivity = ConnectivityOnline
 	a.mu.Unlock()
 	a.logger.Printf("event=status_report_sent screen_id=%s", a.Config.ScreenID)
--- a/player/agent/internal/app/app_test.go
+++ b/player/agent/internal/app/app_test.go
@@ -15,10 +15,16 @@ import (
 type recordingReporter struct {
 	callCount int
 	err       error
+	errs      []error
 }

 func (r *recordingReporter) Send(_ context.Context, _ statusreporter.Snapshot) error {
 	r.callCount++
+	if len(r.errs) > 0 {
+		err := r.errs[0]
+		r.errs = r.errs[1:]
+		return err
+	}
 	return r.err
 }

@@ -236,3 +242,49 @@ func TestAppRunMarksServerConnectivityOnlineAfterSuccessfulReport(t *testing.T)
 	cancel()
 	<-errCh
 }
+
+func TestReportStatusMarksServerConnectivityOfflineAfterRepeatedFailures(t *testing.T) {
+	reporter := &recordingReporter{err: context.DeadlineExceeded}
+	application := newApp(config.Config{
+		ScreenID:          "screen-offline",
+		ServerBaseURL:     "http://127.0.0.1:8080",
+		MQTTBroker:        "tcp://127.0.0.1:1883",
+		HeartbeatEvery:    30,
+		StatusReportEvery: 30,
+	}, log.New(&bytes.Buffer{}, "", 0), time.Now, reporter)
+
+	application.reportStatus(context.Background())
+	if got, want := application.Snapshot().ServerConnectivity, ConnectivityDegraded; got != want {
+		t.Fatalf("after first failure ServerConnectivity = %q, want %q", got, want)
+	}
+
+	application.reportStatus(context.Background())
+	application.reportStatus(context.Background())
+
+	if got, want := application.Snapshot().ServerConnectivity, ConnectivityOffline; got != want {
+		t.Fatalf("after repeated failures ServerConnectivity = %q, want %q", got, want)
+	}
+}
+
+func TestReportStatusRecoversFromOfflineToOnline(t *testing.T) {
+	reporter := &recordingReporter{errs: []error{context.DeadlineExceeded, context.DeadlineExceeded, context.DeadlineExceeded, nil}}
+	application := newApp(config.Config{
+		ScreenID:          "screen-recover",
+		ServerBaseURL:     "http://127.0.0.1:8080",
+		MQTTBroker:        "tcp://127.0.0.1:1883",
+		HeartbeatEvery:    30,
+		StatusReportEvery: 30,
+	}, log.New(&bytes.Buffer{}, "", 0), time.Now, reporter)
+
+	application.reportStatus(context.Background())
+	application.reportStatus(context.Background())
+	application.reportStatus(context.Background())
+	if got, want := application.Snapshot().ServerConnectivity, ConnectivityOffline; got != want {
+		t.Fatalf("offline state = %q, want %q", got, want)
+	}
+
+	application.reportStatus(context.Background())
+	if got, want := application.Snapshot().ServerConnectivity, ConnectivityOnline; got != want {
+		t.Fatalf("recovered state = %q, want %q", got, want)
+	}
+}