|
9f5df4d…
|
lmata
|
1 |
// relay-watchdog monitors a scuttlebot server and signals relay processes |
|
9f5df4d…
|
lmata
|
2 |
// to reconnect when the server restarts or becomes unreachable. |
|
9f5df4d…
|
lmata
|
3 |
// |
|
9f5df4d…
|
lmata
|
4 |
// Usage: relay-watchdog --url https://irc.scuttlebot.net --token <token> [--interval 10s] |
|
9f5df4d…
|
lmata
|
5 |
// |
|
9f5df4d…
|
lmata
|
6 |
// It polls the server's /v1/status endpoint every 10 seconds by default. When |
|
9f5df4d…
|
lmata
|
7 |
// the server's start time changes (restart), or the API recovers after an |
|
9f5df4d…
|
lmata
|
8 |
// extended outage (about 60 seconds at the default interval), it sends SIGUSR1 |
|
9f5df4d…
|
lmata
|
9 |
// to every claude-relay, codex-relay, or gemini-relay process found via pgrep. |
|
9f5df4d…
|
lmata
|
10 |
package main |
|
9f5df4d…
|
lmata
|
11 |
|
|
9f5df4d…
|
lmata
|
12 |
import ( |
|
9f5df4d…
|
lmata
|
13 |
"encoding/json" |
|
9f5df4d…
|
lmata
|
14 |
"flag" |
|
9f5df4d…
|
lmata
|
15 |
"fmt" |
|
9f5df4d…
|
lmata
|
16 |
"net/http" |
|
9f5df4d…
|
lmata
|
17 |
"os" |
|
9f5df4d…
|
lmata
|
18 |
"os/exec" |
|
9f5df4d…
|
lmata
|
19 |
"os/signal" |
|
9f5df4d…
|
lmata
|
20 |
"strings" |
|
9f5df4d…
|
lmata
|
21 |
"syscall" |
|
9f5df4d…
|
lmata
|
22 |
"time" |
|
9f5df4d…
|
lmata
|
23 |
) |
|
9f5df4d…
|
lmata
|
24 |
|
|
9f5df4d…
|
lmata
|
25 |
// loadEnvFile reads KEY=VALUE pairs from the file at path and exports them
// into the process environment.
//
// Blank lines and lines starting with "#" are skipped, as are lines without
// an "=" separator or with an empty key. Keys and values are trimmed of
// surrounding whitespace. A variable that is already present in the
// environment, even with an empty value, is left untouched so that explicit
// settings always win over the file. A file that cannot be read is silently
// ignored.
func loadEnvFile(path string) {
	data, err := os.ReadFile(path)
	if err != nil {
		return // nothing to load
	}
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line) // also drops a trailing "\r" from CRLF files
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		k, v, ok := strings.Cut(line, "=")
		if !ok {
			continue
		}
		k = strings.TrimSpace(k)
		if k == "" {
			continue // "=value" has no usable name
		}
		// LookupEnv (not Getenv) so a variable explicitly set to "" still
		// counts as set and is not overridden.
		if _, set := os.LookupEnv(k); set {
			continue
		}
		if err := os.Setenv(k, strings.TrimSpace(v)); err != nil {
			fmt.Fprintf(os.Stderr, "relay-watchdog: %s: cannot set %q: %v\n", path, k, err)
		}
	}
}
|
9f5df4d…
|
lmata
|
44 |
|
|
9f5df4d…
|
lmata
|
45 |
func main() { |
|
9f5df4d…
|
lmata
|
46 |
// Load the shared relay config. |
|
9f5df4d…
|
lmata
|
47 |
home, _ := os.UserHomeDir() |
|
9f5df4d…
|
lmata
|
48 |
if home != "" { |
|
9f5df4d…
|
lmata
|
49 |
loadEnvFile(home + "/.config/scuttlebot-relay.env") |
|
9f5df4d…
|
lmata
|
50 |
} |
|
9f5df4d…
|
lmata
|
51 |
|
|
9f5df4d…
|
lmata
|
52 |
url := flag.String("url", os.Getenv("SCUTTLEBOT_URL"), "scuttlebot API URL") |
|
9f5df4d…
|
lmata
|
53 |
token := flag.String("token", os.Getenv("SCUTTLEBOT_TOKEN"), "API token") |
|
9f5df4d…
|
lmata
|
54 |
interval := flag.Duration("interval", 10*time.Second, "poll interval") |
|
9f5df4d…
|
lmata
|
55 |
flag.Parse() |
|
9f5df4d…
|
lmata
|
56 |
|
|
9f5df4d…
|
lmata
|
57 |
if *url == "" || *token == "" { |
|
9f5df4d…
|
lmata
|
58 |
fmt.Fprintf(os.Stderr, "relay-watchdog: SCUTTLEBOT_URL and SCUTTLEBOT_TOKEN required\n") |
|
9f5df4d…
|
lmata
|
59 |
os.Exit(1) |
|
9f5df4d…
|
lmata
|
60 |
} |
|
9f5df4d…
|
lmata
|
61 |
|
|
9f5df4d…
|
lmata
|
62 |
// Handle graceful shutdown. |
|
9f5df4d…
|
lmata
|
63 |
sigCh := make(chan os.Signal, 1) |
|
9f5df4d…
|
lmata
|
64 |
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) |
|
9f5df4d…
|
lmata
|
65 |
|
|
9f5df4d…
|
lmata
|
66 |
var lastStart string |
|
9f5df4d…
|
lmata
|
67 |
failures := 0 |
|
9f5df4d…
|
lmata
|
68 |
client := &http.Client{Timeout: 5 * time.Second} |
|
9f5df4d…
|
lmata
|
69 |
|
|
9f5df4d…
|
lmata
|
70 |
fmt.Fprintf(os.Stderr, "relay-watchdog: monitoring %s every %s\n", *url, *interval) |
|
9f5df4d…
|
lmata
|
71 |
|
|
9f5df4d…
|
lmata
|
72 |
ticker := time.NewTicker(*interval) |
|
9f5df4d…
|
lmata
|
73 |
defer ticker.Stop() |
|
9f5df4d…
|
lmata
|
74 |
|
|
9f5df4d…
|
lmata
|
75 |
for { |
|
9f5df4d…
|
lmata
|
76 |
select { |
|
9f5df4d…
|
lmata
|
77 |
case <-sigCh: |
|
9f5df4d…
|
lmata
|
78 |
fmt.Fprintf(os.Stderr, "relay-watchdog: shutting down\n") |
|
9f5df4d…
|
lmata
|
79 |
return |
|
9f5df4d…
|
lmata
|
80 |
case <-ticker.C: |
|
9f5df4d…
|
lmata
|
81 |
} |
|
9f5df4d…
|
lmata
|
82 |
|
|
9f5df4d…
|
lmata
|
83 |
start := getStart(client, *url, *token) |
|
9f5df4d…
|
lmata
|
84 |
if start == "" { |
|
9f5df4d…
|
lmata
|
85 |
failures++ |
|
9f5df4d…
|
lmata
|
86 |
fmt.Fprintf(os.Stderr, "relay-watchdog: API unreachable (%d)\n", failures) |
|
9f5df4d…
|
lmata
|
87 |
if failures >= 6 { // 60s at 10s interval |
|
9f5df4d…
|
lmata
|
88 |
fmt.Fprintf(os.Stderr, "relay-watchdog: extended outage, will signal relays on recovery\n") |
|
9f5df4d…
|
lmata
|
89 |
} |
|
9f5df4d…
|
lmata
|
90 |
continue |
|
9f5df4d…
|
lmata
|
91 |
} |
|
9f5df4d…
|
lmata
|
92 |
|
|
9f5df4d…
|
lmata
|
93 |
if failures >= 6 { |
|
9f5df4d…
|
lmata
|
94 |
// We were down for a while and just came back. |
|
9f5df4d…
|
lmata
|
95 |
fmt.Fprintf(os.Stderr, "relay-watchdog: API recovered after %d failures, killing relays\n", failures) |
|
9f5df4d…
|
lmata
|
96 |
killRelays() |
|
9f5df4d…
|
lmata
|
97 |
lastStart = start |
|
9f5df4d…
|
lmata
|
98 |
failures = 0 |
|
9f5df4d…
|
lmata
|
99 |
continue |
|
9f5df4d…
|
lmata
|
100 |
} |
|
9f5df4d…
|
lmata
|
101 |
|
|
9f5df4d…
|
lmata
|
102 |
if lastStart == "" { |
|
9f5df4d…
|
lmata
|
103 |
lastStart = start |
|
9f5df4d…
|
lmata
|
104 |
failures = 0 |
|
9f5df4d…
|
lmata
|
105 |
continue |
|
9f5df4d…
|
lmata
|
106 |
} |
|
9f5df4d…
|
lmata
|
107 |
|
|
9f5df4d…
|
lmata
|
108 |
if start != lastStart { |
|
9f5df4d…
|
lmata
|
109 |
fmt.Fprintf(os.Stderr, "relay-watchdog: server restarted (was %s, now %s), killing relays\n", lastStart, start) |
|
9f5df4d…
|
lmata
|
110 |
killRelays() |
|
9f5df4d…
|
lmata
|
111 |
lastStart = start |
|
9f5df4d…
|
lmata
|
112 |
} |
|
9f5df4d…
|
lmata
|
113 |
failures = 0 |
|
9f5df4d…
|
lmata
|
114 |
} |
|
9f5df4d…
|
lmata
|
115 |
} |
|
9f5df4d…
|
lmata
|
116 |
|
|
9f5df4d…
|
lmata
|
117 |
// getStart asks the scuttlebot API at url for its status and returns the
// "started" field of the JSON response.
//
// The request is GET <url>/v1/status (a trailing slash on url is tolerated)
// with token sent as a Bearer Authorization header. The empty string is
// returned when the request cannot be built or sent, when the server answers
// with a non-2xx status, or when the body is not valid JSON; callers treat ""
// as "API unreachable".
func getStart(client *http.Client, url, token string) string {
	endpoint := strings.TrimRight(url, "/") + "/v1/status"
	req, err := http.NewRequest(http.MethodGet, endpoint, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	// An error response (bad token, 5xx, proxy page) is not a usable status.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return ""
	}

	var status struct {
		Started string `json:"started"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		return ""
	}
	return status.Started
}
|
9f5df4d…
|
lmata
|
134 |
|
|
9f5df4d…
|
lmata
|
135 |
// killRelays sends SIGUSR1 to every process whose command line matches
// "(claude|codex|gemini)-relay", skipping the watchdog's own PID.
//
// Despite the name, nothing is terminated here: the comment in the original
// code states that relays handle SIGUSR1 by tearing down and rebuilding the
// IRC connection without killing the Claude subprocess.
//
// Processes are discovered with pgrep and signaled with kill, so both tools
// must be on PATH. Failures are logged to stderr and otherwise ignored; one
// relay that cannot be signaled does not stop the others from being signaled.
func killRelays() {
	out, err := exec.Command("pgrep", "-f", "(claude|codex|gemini)-relay").Output()
	if err != nil {
		// pgrep exits with status 1 when nothing matched; anything else
		// (binary missing, bad pattern, ...) is a real failure worth surfacing.
		if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 {
			fmt.Fprintf(os.Stderr, "relay-watchdog: no relay processes found\n")
		} else {
			fmt.Fprintf(os.Stderr, "relay-watchdog: pgrep failed: %v\n", err)
		}
		return
	}

	self := fmt.Sprint(os.Getpid())
	for _, pid := range strings.Fields(string(out)) {
		if pid == self {
			continue // the watchdog's own command line can match the pattern
		}
		fmt.Fprintf(os.Stderr, "relay-watchdog: signaling relay pid %s (SIGUSR1)\n", pid)
		if err := exec.Command("kill", "-USR1", pid).Run(); err != nil {
			fmt.Fprintf(os.Stderr, "relay-watchdog: failed to signal pid %s: %v\n", pid, err)
		}
	}
}