|
1
|
// relay-watchdog monitors a scuttlebot server and signals relay processes
// to reconnect when the server restarts or recovers from an outage.
//
// Usage: relay-watchdog --url https://irc.scuttlebot.net --token <token> --signal <pid>
//
// It polls the server's /v1/status endpoint every 10 seconds. When the
// server's start time changes (restart), or when the API answers again after
// having been unreachable for 60 seconds (network issue), it sends SIGUSR1 to
// the specified PID (or all relay processes if --signal 0).
|
10
|
package main |
|
11
|
|
|
12
|
import ( |
|
13
|
"encoding/json" |
|
14
|
"flag" |
|
15
|
"fmt" |
|
16
|
"net/http" |
|
17
|
"os" |
|
18
|
"os/exec" |
|
19
|
"os/signal" |
|
20
|
"strings" |
|
21
|
"syscall" |
|
22
|
"time" |
|
23
|
) |
|
24
|
|
|
25
|
// loadEnvFile reads KEY=VALUE pairs from the file at path and copies them into
// the process environment.
//
// The file is optional: if it cannot be read the function returns silently.
// Blank lines and lines starting with "#" are ignored, as are lines without
// an "=" or with an empty key. Because the file may also be written in shell
// syntax, a leading "export " is accepted and one pair of matching single or
// double quotes around the value is removed.
//
// A variable that already has a non-empty value in the environment is never
// overwritten, so explicitly exported settings win over the file.
func loadEnvFile(path string) {
	data, err := os.ReadFile(path)
	if err != nil {
		return
	}
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line) // also drops the "\r" of CRLF files
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		// Tolerate shell-style "export KEY=value" lines.
		line = strings.TrimSpace(strings.TrimPrefix(line, "export "))

		k, v, ok := strings.Cut(line, "=")
		if !ok {
			continue
		}
		k = strings.TrimSpace(k)
		v = strings.TrimSpace(v)
		if k == "" {
			continue // "=value" has no usable name
		}
		// Strip one pair of matching quotes: KEY="value" or KEY='value'.
		if n := len(v); n >= 2 && (v[0] == '"' || v[0] == '\'') && v[n-1] == v[0] {
			v = v[1 : n-1]
		}
		if os.Getenv(k) != "" { // don't override explicit env
			continue
		}
		if err := os.Setenv(k, v); err != nil {
			fmt.Fprintf(os.Stderr, "relay-watchdog: cannot set %s from %s: %v\n", k, path, err)
		}
	}
}
|
44
|
|
|
45
|
func main() { |
|
46
|
// Load the shared relay config. |
|
47
|
home, _ := os.UserHomeDir() |
|
48
|
if home != "" { |
|
49
|
loadEnvFile(home + "/.config/scuttlebot-relay.env") |
|
50
|
} |
|
51
|
|
|
52
|
url := flag.String("url", os.Getenv("SCUTTLEBOT_URL"), "scuttlebot API URL") |
|
53
|
token := flag.String("token", os.Getenv("SCUTTLEBOT_TOKEN"), "API token") |
|
54
|
interval := flag.Duration("interval", 10*time.Second, "poll interval") |
|
55
|
flag.Parse() |
|
56
|
|
|
57
|
if *url == "" || *token == "" { |
|
58
|
fmt.Fprintf(os.Stderr, "relay-watchdog: SCUTTLEBOT_URL and SCUTTLEBOT_TOKEN required\n") |
|
59
|
os.Exit(1) |
|
60
|
} |
|
61
|
|
|
62
|
// Handle graceful shutdown. |
|
63
|
sigCh := make(chan os.Signal, 1) |
|
64
|
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) |
|
65
|
|
|
66
|
var lastStart string |
|
67
|
failures := 0 |
|
68
|
client := &http.Client{Timeout: 5 * time.Second} |
|
69
|
|
|
70
|
fmt.Fprintf(os.Stderr, "relay-watchdog: monitoring %s every %s\n", *url, *interval) |
|
71
|
|
|
72
|
ticker := time.NewTicker(*interval) |
|
73
|
defer ticker.Stop() |
|
74
|
|
|
75
|
for { |
|
76
|
select { |
|
77
|
case <-sigCh: |
|
78
|
fmt.Fprintf(os.Stderr, "relay-watchdog: shutting down\n") |
|
79
|
return |
|
80
|
case <-ticker.C: |
|
81
|
} |
|
82
|
|
|
83
|
start := getStart(client, *url, *token) |
|
84
|
if start == "" { |
|
85
|
failures++ |
|
86
|
fmt.Fprintf(os.Stderr, "relay-watchdog: API unreachable (%d)\n", failures) |
|
87
|
if failures >= 6 { // 60s at 10s interval |
|
88
|
fmt.Fprintf(os.Stderr, "relay-watchdog: extended outage, will signal relays on recovery\n") |
|
89
|
} |
|
90
|
continue |
|
91
|
} |
|
92
|
|
|
93
|
if failures >= 6 { |
|
94
|
// We were down for a while and just came back. |
|
95
|
fmt.Fprintf(os.Stderr, "relay-watchdog: API recovered after %d failures, killing relays\n", failures) |
|
96
|
killRelays() |
|
97
|
lastStart = start |
|
98
|
failures = 0 |
|
99
|
continue |
|
100
|
} |
|
101
|
|
|
102
|
if lastStart == "" { |
|
103
|
lastStart = start |
|
104
|
failures = 0 |
|
105
|
continue |
|
106
|
} |
|
107
|
|
|
108
|
if start != lastStart { |
|
109
|
fmt.Fprintf(os.Stderr, "relay-watchdog: server restarted (was %s, now %s), killing relays\n", lastStart, start) |
|
110
|
killRelays() |
|
111
|
lastStart = start |
|
112
|
} |
|
113
|
failures = 0 |
|
114
|
} |
|
115
|
} |
|
116
|
|
|
117
|
// getStart queries <url>/v1/status with the given bearer token and returns the
// "started" field of the JSON response.
//
// It returns the empty string whenever no start time could be obtained: the
// request could not be built or sent, the server answered with a non-2xx
// status, or the body was not valid JSON. Callers treat "" as "unreachable".
func getStart(client *http.Client, url, token string) string {
	// Avoid "//v1/status" when the base URL was given with a trailing slash.
	endpoint := strings.TrimRight(url, "/") + "/v1/status"

	req, err := http.NewRequest(http.MethodGet, endpoint, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	// An error response is not a status report, whatever its body contains.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return ""
	}

	var s struct {
		Started string `json:"started"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&s); err != nil {
		return ""
	}
	return s.Started
}
|
134
|
|
|
135
|
// killRelays finds the relay processes and sends each of them SIGUSR1 to
// trigger an IRC reconnection. Despite the name nothing is terminated: the
// relay handles SIGUSR1 by tearing down and rebuilding the IRC connection
// without killing the Claude subprocess.
//
// Processes are located with `pgrep -f` on the command line pattern
// "(claude|codex|gemini)-relay"; the watchdog's own PID is skipped. Failures
// are reported on stderr and never abort the watchdog.
func killRelays() {
	out, err := exec.Command("pgrep", "-f", "(claude|codex|gemini)-relay").Output()
	if err != nil {
		// pgrep exits with status 1 when nothing matched; anything else
		// (binary missing, bad invocation, ...) is a real failure.
		if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 {
			fmt.Fprintf(os.Stderr, "relay-watchdog: no relay processes found\n")
		} else {
			fmt.Fprintf(os.Stderr, "relay-watchdog: pgrep failed: %v\n", err)
		}
		return
	}

	self := fmt.Sprintf("%d", os.Getpid())
	for _, pid := range strings.Fields(string(out)) {
		if pid == self {
			continue
		}
		fmt.Fprintf(os.Stderr, "relay-watchdog: signaling relay pid %s (SIGUSR1)\n", pid)
		if err := exec.Command("kill", "-USR1", pid).Run(); err != nil {
			// The process may have exited between pgrep and kill.
			fmt.Fprintf(os.Stderr, "relay-watchdog: signaling pid %s failed: %v\n", pid, err)
		}
	}
}
|
154
|
|