ScuttleBot
feat: automatic stale agent cleanup with configurable reap_after_days

- Add Reap() to registry — removes agents not seen in N days
- Hourly reaper goroutine reads reap_after_days from agent policy
- Revoked agents are reaped too if past the cutoff
- Online agents are never reaped
- Setting exposed in Settings UI → Agent Policy → "reap after days"
- Default 0 (disabled)

Closes #49
Commit
cd79584f050b25cb81b8c5dcd9b6d2d905c65824fc29038bf4d3a7633a9ff72f
Parent
66d18d7cb0c2b3f…
4 files changed
+20
+1
+7
+33
+20
| --- cmd/scuttlebot/main.go | ||
| +++ cmd/scuttlebot/main.go | ||
| @@ -294,10 +294,30 @@ | ||
| 294 | 294 | Config: b.Config, |
| 295 | 295 | } |
| 296 | 296 | } |
| 297 | 297 | botMgr.Sync(ctx, specs) |
| 298 | 298 | } |
| 299 | + | |
| 300 | + // Agent reaper — periodically removes stale agents based on policy. | |
| 301 | + go func() { | |
| 302 | + ticker := time.NewTicker(1 * time.Hour) | |
| 303 | + defer ticker.Stop() | |
| 304 | + for { | |
| 305 | + select { | |
| 306 | + case <-ctx.Done(): | |
| 307 | + return | |
| 308 | + case <-ticker.C: | |
| 309 | + p := policyStore.Get() | |
| 310 | + if p.AgentPolicy.ReapAfterDays > 0 { | |
| 311 | + maxAge := time.Duration(p.AgentPolicy.ReapAfterDays) * 24 * time.Hour | |
| 312 | + if n := reg.Reap(maxAge); n > 0 { | |
| 313 | + log.Info("reaped stale agents", "count", n, "max_age_days", p.AgentPolicy.ReapAfterDays) | |
| 314 | + } | |
| 315 | + } | |
| 316 | + } | |
| 317 | + } | |
| 318 | + }() | |
| 299 | 319 | |
| 300 | 320 | // Config store — owns write-back to scuttlebot.yaml with history snapshots. |
| 301 | 321 | cfgStore := api.NewConfigStore(*configPath, *cfg) |
| 302 | 322 | cfgStore.OnChange(func(updated config.Config) { |
| 303 | 323 | // Hot-reload topology on config change. |
| 304 | 324 |
| --- cmd/scuttlebot/main.go | |
| +++ cmd/scuttlebot/main.go | |
| @@ -294,10 +294,30 @@ | |
| 294 | Config: b.Config, |
| 295 | } |
| 296 | } |
| 297 | botMgr.Sync(ctx, specs) |
| 298 | } |
| 299 | |
| 300 | // Config store — owns write-back to scuttlebot.yaml with history snapshots. |
| 301 | cfgStore := api.NewConfigStore(*configPath, *cfg) |
| 302 | cfgStore.OnChange(func(updated config.Config) { |
| 303 | // Hot-reload topology on config change. |
| 304 |
| --- cmd/scuttlebot/main.go | |
| +++ cmd/scuttlebot/main.go | |
| @@ -294,10 +294,30 @@ | |
| 294 | Config: b.Config, |
| 295 | } |
| 296 | } |
| 297 | botMgr.Sync(ctx, specs) |
| 298 | } |
| 299 | |
| 300 | // Agent reaper — periodically removes stale agents based on policy. |
| 301 | go func() { |
| 302 | ticker := time.NewTicker(1 * time.Hour) |
| 303 | defer ticker.Stop() |
| 304 | for { |
| 305 | select { |
| 306 | case <-ctx.Done(): |
| 307 | return |
| 308 | case <-ticker.C: |
| 309 | p := policyStore.Get() |
| 310 | if p.AgentPolicy.ReapAfterDays > 0 { |
| 311 | maxAge := time.Duration(p.AgentPolicy.ReapAfterDays) * 24 * time.Hour |
| 312 | if n := reg.Reap(maxAge); n > 0 { |
| 313 | log.Info("reaped stale agents", "count", n, "max_age_days", p.AgentPolicy.ReapAfterDays) |
| 314 | } |
| 315 | } |
| 316 | } |
| 317 | } |
| 318 | }() |
| 319 | |
| 320 | // Config store — owns write-back to scuttlebot.yaml with history snapshots. |
| 321 | cfgStore := api.NewConfigStore(*configPath, *cfg) |
| 322 | cfgStore.OnChange(func(updated config.Config) { |
| 323 | // Hot-reload topology on config change. |
| 324 |
| --- internal/api/policies.go | ||
| +++ internal/api/policies.go | ||
| @@ -29,10 +29,11 @@ | ||
| 29 | 29 | type AgentPolicy struct { |
| 30 | 30 | RequireCheckin bool `json:"require_checkin"` |
| 31 | 31 | CheckinChannel string `json:"checkin_channel"` |
| 32 | 32 | RequiredChannels []string `json:"required_channels"` |
| 33 | 33 | OnlineTimeoutSecs int `json:"online_timeout_secs,omitempty"` |
| 34 | + ReapAfterDays int `json:"reap_after_days,omitempty"` | |
| 34 | 35 | } |
| 35 | 36 | |
| 36 | 37 | // LoggingPolicy configures message logging. |
| 37 | 38 | type LoggingPolicy struct { |
| 38 | 39 | Enabled bool `json:"enabled"` |
| 39 | 40 |
| --- internal/api/policies.go | |
| +++ internal/api/policies.go | |
| @@ -29,10 +29,11 @@ | |
| 29 | type AgentPolicy struct { |
| 30 | RequireCheckin bool `json:"require_checkin"` |
| 31 | CheckinChannel string `json:"checkin_channel"` |
| 32 | RequiredChannels []string `json:"required_channels"` |
| 33 | OnlineTimeoutSecs int `json:"online_timeout_secs,omitempty"` |
| 34 | } |
| 35 | |
| 36 | // LoggingPolicy configures message logging. |
| 37 | type LoggingPolicy struct { |
| 38 | Enabled bool `json:"enabled"` |
| 39 |
| --- internal/api/policies.go | |
| +++ internal/api/policies.go | |
| @@ -29,10 +29,11 @@ | |
| 29 | type AgentPolicy struct { |
| 30 | RequireCheckin bool `json:"require_checkin"` |
| 31 | CheckinChannel string `json:"checkin_channel"` |
| 32 | RequiredChannels []string `json:"required_channels"` |
| 33 | OnlineTimeoutSecs int `json:"online_timeout_secs,omitempty"` |
| 34 | ReapAfterDays int `json:"reap_after_days,omitempty"` |
| 35 | } |
| 36 | |
| 37 | // LoggingPolicy configures message logging. |
| 38 | type LoggingPolicy struct { |
| 39 | Enabled bool `json:"enabled"` |
| 40 |
| --- internal/api/ui/index.html | ||
| +++ internal/api/ui/index.html | ||
| @@ -605,10 +605,15 @@ | ||
| 605 | 605 | <div class="setting-row"> |
| 606 | 606 | <div class="setting-label">online timeout</div> |
| 607 | 607 | <div class="setting-desc">Seconds since last heartbeat before an agent is considered offline. Default: 120.</div> |
| 608 | 608 | <input type="number" id="policy-online-timeout" placeholder="120" min="10" max="3600" style="width:100px;padding:4px 8px;font-size:12px"> |
| 609 | 609 | </div> |
| 610 | + <div class="setting-row"> | |
| 611 | + <div class="setting-label">reap after days</div> | |
| 612 | + <div class="setting-desc">Remove stale agents not seen in this many days. 0 = never reap.</div> | |
| 613 | + <input type="number" id="policy-reap-days" placeholder="0" min="0" max="365" style="width:100px;padding:4px 8px;font-size:12px"> | |
| 614 | + </div> | |
| 610 | 615 | </div> |
| 611 | 616 | <div id="agentpolicy-save-result" style="display:none;margin:0 16px 12px"></div> |
| 612 | 617 | </div> |
| 613 | 618 | |
| 614 | 619 | <!-- bridge --> |
| @@ -2767,10 +2772,11 @@ | ||
| 2767 | 2772 | function renderAgentPolicy(p) { |
| 2768 | 2773 | document.getElementById('policy-checkin-enabled').checked = !!p.require_checkin; |
| 2769 | 2774 | document.getElementById('policy-checkin-channel').value = p.checkin_channel || ''; |
| 2770 | 2775 | document.getElementById('policy-required-channels').value = (p.required_channels||[]).join(', '); |
| 2771 | 2776 | document.getElementById('policy-online-timeout').value = p.online_timeout_secs || ''; |
| 2777 | + document.getElementById('policy-reap-days').value = p.reap_after_days || ''; | |
| 2772 | 2778 | toggleCheckinChannel(); |
| 2773 | 2779 | } |
| 2774 | 2780 | function toggleCheckinChannel() { |
| 2775 | 2781 | const on = document.getElementById('policy-checkin-enabled').checked; |
| 2776 | 2782 | document.getElementById('policy-checkin-row').style.display = on ? '' : 'none'; |
| @@ -3024,10 +3030,11 @@ | ||
| 3024 | 3030 | agent_policy: { |
| 3025 | 3031 | require_checkin: document.getElementById('policy-checkin-enabled').checked, |
| 3026 | 3032 | checkin_channel: document.getElementById('policy-checkin-channel').value.trim(), |
| 3027 | 3033 | required_channels: document.getElementById('policy-required-channels').value.split(',').map(s=>s.trim()).filter(Boolean), |
| 3028 | 3034 | online_timeout_secs: parseInt(document.getElementById('policy-online-timeout').value) || 0, |
| 3035 | + reap_after_days: parseInt(document.getElementById('policy-reap-days').value) || 0, | |
| 3029 | 3036 | } |
| 3030 | 3037 | }, 'agentpolicy-save-result'); |
| 3031 | 3038 | } |
| 3032 | 3039 | |
| 3033 | 3040 | function saveBridgeConfig() { |
| 3034 | 3041 |
| --- internal/api/ui/index.html | |
| +++ internal/api/ui/index.html | |
| @@ -605,10 +605,15 @@ | |
| 605 | <div class="setting-row"> |
| 606 | <div class="setting-label">online timeout</div> |
| 607 | <div class="setting-desc">Seconds since last heartbeat before an agent is considered offline. Default: 120.</div> |
| 608 | <input type="number" id="policy-online-timeout" placeholder="120" min="10" max="3600" style="width:100px;padding:4px 8px;font-size:12px"> |
| 609 | </div> |
| 610 | </div> |
| 611 | <div id="agentpolicy-save-result" style="display:none;margin:0 16px 12px"></div> |
| 612 | </div> |
| 613 | |
| 614 | <!-- bridge --> |
| @@ -2767,10 +2772,11 @@ | |
| 2767 | function renderAgentPolicy(p) { |
| 2768 | document.getElementById('policy-checkin-enabled').checked = !!p.require_checkin; |
| 2769 | document.getElementById('policy-checkin-channel').value = p.checkin_channel || ''; |
| 2770 | document.getElementById('policy-required-channels').value = (p.required_channels||[]).join(', '); |
| 2771 | document.getElementById('policy-online-timeout').value = p.online_timeout_secs || ''; |
| 2772 | toggleCheckinChannel(); |
| 2773 | } |
| 2774 | function toggleCheckinChannel() { |
| 2775 | const on = document.getElementById('policy-checkin-enabled').checked; |
| 2776 | document.getElementById('policy-checkin-row').style.display = on ? '' : 'none'; |
| @@ -3024,10 +3030,11 @@ | |
| 3024 | agent_policy: { |
| 3025 | require_checkin: document.getElementById('policy-checkin-enabled').checked, |
| 3026 | checkin_channel: document.getElementById('policy-checkin-channel').value.trim(), |
| 3027 | required_channels: document.getElementById('policy-required-channels').value.split(',').map(s=>s.trim()).filter(Boolean), |
| 3028 | online_timeout_secs: parseInt(document.getElementById('policy-online-timeout').value) || 0, |
| 3029 | } |
| 3030 | }, 'agentpolicy-save-result'); |
| 3031 | } |
| 3032 | |
| 3033 | function saveBridgeConfig() { |
| 3034 |
| --- internal/api/ui/index.html | |
| +++ internal/api/ui/index.html | |
| @@ -605,10 +605,15 @@ | |
| 605 | <div class="setting-row"> |
| 606 | <div class="setting-label">online timeout</div> |
| 607 | <div class="setting-desc">Seconds since last heartbeat before an agent is considered offline. Default: 120.</div> |
| 608 | <input type="number" id="policy-online-timeout" placeholder="120" min="10" max="3600" style="width:100px;padding:4px 8px;font-size:12px"> |
| 609 | </div> |
| 610 | <div class="setting-row"> |
| 611 | <div class="setting-label">reap after days</div> |
| 612 | <div class="setting-desc">Remove stale agents not seen in this many days. 0 = never reap.</div> |
| 613 | <input type="number" id="policy-reap-days" placeholder="0" min="0" max="365" style="width:100px;padding:4px 8px;font-size:12px"> |
| 614 | </div> |
| 615 | </div> |
| 616 | <div id="agentpolicy-save-result" style="display:none;margin:0 16px 12px"></div> |
| 617 | </div> |
| 618 | |
| 619 | <!-- bridge --> |
| @@ -2767,10 +2772,11 @@ | |
| 2772 | function renderAgentPolicy(p) { |
| 2773 | document.getElementById('policy-checkin-enabled').checked = !!p.require_checkin; |
| 2774 | document.getElementById('policy-checkin-channel').value = p.checkin_channel || ''; |
| 2775 | document.getElementById('policy-required-channels').value = (p.required_channels||[]).join(', '); |
| 2776 | document.getElementById('policy-online-timeout').value = p.online_timeout_secs || ''; |
| 2777 | document.getElementById('policy-reap-days').value = p.reap_after_days || ''; |
| 2778 | toggleCheckinChannel(); |
| 2779 | } |
| 2780 | function toggleCheckinChannel() { |
| 2781 | const on = document.getElementById('policy-checkin-enabled').checked; |
| 2782 | document.getElementById('policy-checkin-row').style.display = on ? '' : 'none'; |
| @@ -3024,10 +3030,11 @@ | |
| 3030 | agent_policy: { |
| 3031 | require_checkin: document.getElementById('policy-checkin-enabled').checked, |
| 3032 | checkin_channel: document.getElementById('policy-checkin-channel').value.trim(), |
| 3033 | required_channels: document.getElementById('policy-required-channels').value.split(',').map(s=>s.trim()).filter(Boolean), |
| 3034 | online_timeout_secs: parseInt(document.getElementById('policy-online-timeout').value) || 0, |
| 3035 | reap_after_days: parseInt(document.getElementById('policy-reap-days').value) || 0, |
| 3036 | } |
| 3037 | }, 'agentpolicy-save-result'); |
| 3038 | } |
| 3039 | |
| 3040 | function saveBridgeConfig() { |
| 3041 |
| --- internal/registry/registry.go | ||
| +++ internal/registry/registry.go | ||
| @@ -407,10 +407,43 @@ | ||
| 407 | 407 | if r.onlineTimeout > 0 { |
| 408 | 408 | return r.onlineTimeout |
| 409 | 409 | } |
| 410 | 410 | return defaultOnlineTimeout |
| 411 | 411 | } |
| 412 | + | |
| 413 | +// Reap removes agents that haven't been seen in maxAge. Revoked agents | |
| 414 | +// are always reaped if older than maxAge. Returns the number of agents removed. | |
| 415 | +func (r *Registry) Reap(maxAge time.Duration) int { | |
| 416 | + if maxAge <= 0 { | |
| 417 | + return 0 | |
| 418 | + } | |
| 419 | + r.mu.Lock() | |
| 420 | + defer r.mu.Unlock() | |
| 421 | + cutoff := time.Now().Add(-maxAge) | |
| 422 | + var reaped int | |
| 423 | + for nick, a := range r.agents { | |
| 424 | + if a.Online { | |
| 425 | + continue | |
| 426 | + } | |
| 427 | + // Use last_seen if available, otherwise fall back to created_at. | |
| 428 | + ref := a.CreatedAt | |
| 429 | + if a.LastSeen != nil { | |
| 430 | + ref = *a.LastSeen | |
| 431 | + } | |
| 432 | + if ref.Before(cutoff) { | |
| 433 | + delete(r.agents, nick) | |
| 434 | + if r.db != nil { | |
| 435 | + _ = r.db.AgentDelete(nick) | |
| 436 | + } | |
| 437 | + reaped++ | |
| 438 | + } | |
| 439 | + } | |
| 440 | + if reaped > 0 && r.db == nil { | |
| 441 | + r.save() | |
| 442 | + } | |
| 443 | + return reaped | |
| 444 | +} | |
| 412 | 445 | |
| 413 | 446 | // List returns all registered agents with computed online status. |
| 414 | 447 | func (r *Registry) List() []*Agent { |
| 415 | 448 | r.mu.RLock() |
| 416 | 449 | defer r.mu.RUnlock() |
| 417 | 450 |
| --- internal/registry/registry.go | |
| +++ internal/registry/registry.go | |
| @@ -407,10 +407,43 @@ | |
| 407 | if r.onlineTimeout > 0 { |
| 408 | return r.onlineTimeout |
| 409 | } |
| 410 | return defaultOnlineTimeout |
| 411 | } |
| 412 | |
| 413 | // List returns all registered agents with computed online status. |
| 414 | func (r *Registry) List() []*Agent { |
| 415 | r.mu.RLock() |
| 416 | defer r.mu.RUnlock() |
| 417 |
| --- internal/registry/registry.go | |
| +++ internal/registry/registry.go | |
| @@ -407,10 +407,43 @@ | |
| 407 | if r.onlineTimeout > 0 { |
| 408 | return r.onlineTimeout |
| 409 | } |
| 410 | return defaultOnlineTimeout |
| 411 | } |
| 412 | |
| 413 | // Reap removes agents that haven't been seen in maxAge. Revoked agents |
| 414 | // are always reaped if older than maxAge. Returns the number of agents removed. |
| 415 | func (r *Registry) Reap(maxAge time.Duration) int { |
| 416 | if maxAge <= 0 { |
| 417 | return 0 |
| 418 | } |
| 419 | r.mu.Lock() |
| 420 | defer r.mu.Unlock() |
| 421 | cutoff := time.Now().Add(-maxAge) |
| 422 | var reaped int |
| 423 | for nick, a := range r.agents { |
| 424 | if a.Online { |
| 425 | continue |
| 426 | } |
| 427 | // Use last_seen if available, otherwise fall back to created_at. |
| 428 | ref := a.CreatedAt |
| 429 | if a.LastSeen != nil { |
| 430 | ref = *a.LastSeen |
| 431 | } |
| 432 | if ref.Before(cutoff) { |
| 433 | delete(r.agents, nick) |
| 434 | if r.db != nil { |
| 435 | _ = r.db.AgentDelete(nick) |
| 436 | } |
| 437 | reaped++ |
| 438 | } |
| 439 | } |
| 440 | if reaped > 0 && r.db == nil { |
| 441 | r.save() |
| 442 | } |
| 443 | return reaped |
| 444 | } |
| 445 | |
| 446 | // List returns all registered agents with computed online status. |
| 447 | func (r *Registry) List() []*Agent { |
| 448 | r.mu.RLock() |
| 449 | defer r.mu.RUnlock() |
| 450 |