ScuttleBot

scuttlebot / cmd / relay-watchdog / main.go
Blame History Raw 154 lines
1
// relay-watchdog monitors a scuttlebot server and signals relay processes
2
// to reconnect when the server restarts or becomes unreachable.
3
//
4
// Usage: relay-watchdog --url https://irc.scuttlebot.net --token <token> [--interval 10s]
5
//
6
// It polls the server's /v1/status endpoint every 10 seconds (configurable
// with --interval). When the server's start time changes (restart), or the
// API becomes reachable again after an extended outage (60 seconds at the
// default interval, e.g. a network issue), it sends SIGUSR1 to every running
// claude/codex/gemini relay process so they reconnect.
10
package main
11
12
import (
13
"encoding/json"
14
"flag"
15
"fmt"
16
"net/http"
17
"os"
18
"os/exec"
19
"os/signal"
20
"strings"
21
"syscall"
22
"time"
23
)
24
25
// loadEnvFile reads KEY=VALUE pairs from the file at path and exports each one
// into the process environment unless that variable is already set there.
//
// Blank lines, lines starting with '#', lines without '=', and lines with an
// empty key are skipped. A leading "export " and one pair of matching single
// or double quotes around the value are stripped, so a file written for a
// shell to source loads the same way. A missing or unreadable file is
// silently ignored.
func loadEnvFile(path string) {
	data, err := os.ReadFile(path)
	if err != nil {
		return // nothing to load
	}
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		line = strings.TrimSpace(strings.TrimPrefix(line, "export "))

		k, v, ok := strings.Cut(line, "=")
		if !ok {
			continue
		}
		k = strings.TrimSpace(k)
		if k == "" {
			continue // "=value" has no variable name to set
		}
		v = strings.TrimSpace(v)
		if n := len(v); n >= 2 && (v[0] == '"' || v[0] == '\'') && v[n-1] == v[0] {
			v = v[1 : n-1]
		}

		// LookupEnv (rather than Getenv) so that a variable explicitly set
		// to the empty string still counts as set and is not overridden.
		if _, set := os.LookupEnv(k); set {
			continue
		}
		// Best effort: a key the OS rejects is skipped rather than aborting
		// the rest of the file.
		_ = os.Setenv(k, v)
	}
}
44
45
func main() {
46
// Load the shared relay config.
47
home, _ := os.UserHomeDir()
48
if home != "" {
49
loadEnvFile(home + "/.config/scuttlebot-relay.env")
50
}
51
52
url := flag.String("url", os.Getenv("SCUTTLEBOT_URL"), "scuttlebot API URL")
53
token := flag.String("token", os.Getenv("SCUTTLEBOT_TOKEN"), "API token")
54
interval := flag.Duration("interval", 10*time.Second, "poll interval")
55
flag.Parse()
56
57
if *url == "" || *token == "" {
58
fmt.Fprintf(os.Stderr, "relay-watchdog: SCUTTLEBOT_URL and SCUTTLEBOT_TOKEN required\n")
59
os.Exit(1)
60
}
61
62
// Handle graceful shutdown.
63
sigCh := make(chan os.Signal, 1)
64
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
65
66
var lastStart string
67
failures := 0
68
client := &http.Client{Timeout: 5 * time.Second}
69
70
fmt.Fprintf(os.Stderr, "relay-watchdog: monitoring %s every %s\n", *url, *interval)
71
72
ticker := time.NewTicker(*interval)
73
defer ticker.Stop()
74
75
for {
76
select {
77
case <-sigCh:
78
fmt.Fprintf(os.Stderr, "relay-watchdog: shutting down\n")
79
return
80
case <-ticker.C:
81
}
82
83
start := getStart(client, *url, *token)
84
if start == "" {
85
failures++
86
fmt.Fprintf(os.Stderr, "relay-watchdog: API unreachable (%d)\n", failures)
87
if failures >= 6 { // 60s at 10s interval
88
fmt.Fprintf(os.Stderr, "relay-watchdog: extended outage, will signal relays on recovery\n")
89
}
90
continue
91
}
92
93
if failures >= 6 {
94
// We were down for a while and just came back.
95
fmt.Fprintf(os.Stderr, "relay-watchdog: API recovered after %d failures, killing relays\n", failures)
96
killRelays()
97
lastStart = start
98
failures = 0
99
continue
100
}
101
102
if lastStart == "" {
103
lastStart = start
104
failures = 0
105
continue
106
}
107
108
if start != lastStart {
109
fmt.Fprintf(os.Stderr, "relay-watchdog: server restarted (was %s, now %s), killing relays\n", lastStart, start)
110
killRelays()
111
lastStart = start
112
}
113
failures = 0
114
}
115
}
116
117
// getStart fetches <url>/v1/status with a bearer token and returns the
// "started" field of the JSON response.
//
// It returns "" when the request cannot be built or sent, when the server
// answers with a non-2xx status, or when the body is not JSON with a string
// "started" field. The caller treats "" as "API unreachable". A trailing
// slash on url is tolerated.
func getStart(client *http.Client, url, token string) string {
	// Trim trailing slashes so a base URL such as "https://host/" does not
	// produce "https://host//v1/status".
	endpoint := strings.TrimRight(url, "/") + "/v1/status"

	req, err := http.NewRequest(http.MethodGet, endpoint, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	// Only trust the payload of a successful response; an error response
	// must not be mistaken for a status report.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return ""
	}

	var status struct {
		Started string `json:"started"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		return ""
	}
	return status.Started
}
134
135
// killRelays sends SIGUSR1 to every process whose command line matches
// "(claude|codex|gemini)-relay", skipping this process itself.
//
// Despite the name, nothing is terminated. Per the original author's note,
// the relay handles SIGUSR1 by tearing down and rebuilding its IRC connection
// without killing the Claude subprocess. Discovery uses the external pgrep
// and kill commands; failures are logged to stderr and never abort the
// watchdog.
func killRelays() {
	out, err := exec.Command("pgrep", "-f", "(claude|codex|gemini)-relay").Output()
	if err != nil {
		// pgrep exits with status 1 when nothing matched. Anything else
		// (pgrep missing, bad pattern, ...) is a real failure and should not
		// be reported as "no relays".
		if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 {
			fmt.Fprintf(os.Stderr, "relay-watchdog: no relay processes found\n")
		} else {
			fmt.Fprintf(os.Stderr, "relay-watchdog: pgrep failed: %v\n", err)
		}
		return
	}

	self := fmt.Sprintf("%d", os.Getpid())
	for _, pid := range strings.Fields(string(out)) {
		if pid == self {
			continue
		}
		fmt.Fprintf(os.Stderr, "relay-watchdog: signaling relay pid %s (SIGUSR1)\n", pid)
		// A relay may exit between pgrep and kill, so log the error and
		// keep going with the remaining pids.
		if err := exec.Command("kill", "-USR1", pid).Run(); err != nil {
			fmt.Fprintf(os.Stderr, "relay-watchdog: signaling pid %s failed: %v\n", pid, err)
		}
	}
}
154

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button