ScuttleBot

scuttlebot / cmd / relay-watchdog / main.go
Source Blame History 153 lines
9f5df4d… lmata 1 // relay-watchdog monitors a scuttlebot server and signals relay processes
9f5df4d… lmata 2 // to reconnect when the server restarts or becomes unreachable.
9f5df4d… lmata 3 //
9f5df4d… lmata 4 // Usage: relay-watchdog --url https://irc.scuttlebot.net --token <token> [--interval 10s]
9f5df4d… lmata 5 //
9f5df4d… lmata 6 // It polls the server's /v1/status endpoint every 10 seconds by default. When
9f5df4d… lmata 7 // the server's start time changes (restart), or the API answers again after an
9f5df4d… lmata 8 // extended outage (six failed polls, i.e. 60 seconds, at the default interval),
9f5df4d… lmata 9 // it sends SIGUSR1 to every relay process it finds via pgrep.
9f5df4d… lmata 10 package main
9f5df4d… lmata 11
9f5df4d… lmata 12 import (
9f5df4d… lmata 13 "encoding/json"
9f5df4d… lmata 14 "flag"
9f5df4d… lmata 15 "fmt"
9f5df4d… lmata 16 "net/http"
9f5df4d… lmata 17 "os"
9f5df4d… lmata 18 "os/exec"
9f5df4d… lmata 19 "os/signal"
9f5df4d… lmata 20 "strings"
9f5df4d… lmata 21 "syscall"
9f5df4d… lmata 22 "time"
9f5df4d… lmata 23 )
9f5df4d… lmata 24
// loadEnvFile reads KEY=VALUE pairs from the file at path and exports them
// into the process environment.
//
// Blank lines and lines starting with '#' are skipped, as are lines without
// an '=' or with an empty key. Whitespace around keys and values is trimmed.
// A variable that is already present in the environment is never overwritten,
// even when its current value is the empty string. If the file cannot be read
// the function returns without doing anything.
func loadEnvFile(path string) {
	data, err := os.ReadFile(path)
	if err != nil {
		return
	}
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		k, v, ok := strings.Cut(line, "=")
		if !ok {
			continue
		}
		k = strings.TrimSpace(k)
		if k == "" {
			// os.Setenv rejects an empty name; nothing useful to export.
			continue
		}
		// LookupEnv (rather than Getenv) distinguishes "unset" from "set to
		// empty", so an explicit empty value in the environment still wins.
		if _, set := os.LookupEnv(k); set {
			continue
		}
		if err := os.Setenv(k, strings.TrimSpace(v)); err != nil {
			fmt.Fprintf(os.Stderr, "relay-watchdog: cannot set %s from %s: %v\n", k, path, err)
		}
	}
}
9f5df4d… lmata 44
9f5df4d… lmata 45 func main() {
9f5df4d… lmata 46 // Load the shared relay config.
9f5df4d… lmata 47 home, _ := os.UserHomeDir()
9f5df4d… lmata 48 if home != "" {
9f5df4d… lmata 49 loadEnvFile(home + "/.config/scuttlebot-relay.env")
9f5df4d… lmata 50 }
9f5df4d… lmata 51
9f5df4d… lmata 52 url := flag.String("url", os.Getenv("SCUTTLEBOT_URL"), "scuttlebot API URL")
9f5df4d… lmata 53 token := flag.String("token", os.Getenv("SCUTTLEBOT_TOKEN"), "API token")
9f5df4d… lmata 54 interval := flag.Duration("interval", 10*time.Second, "poll interval")
9f5df4d… lmata 55 flag.Parse()
9f5df4d… lmata 56
9f5df4d… lmata 57 if *url == "" || *token == "" {
9f5df4d… lmata 58 fmt.Fprintf(os.Stderr, "relay-watchdog: SCUTTLEBOT_URL and SCUTTLEBOT_TOKEN required\n")
9f5df4d… lmata 59 os.Exit(1)
9f5df4d… lmata 60 }
9f5df4d… lmata 61
9f5df4d… lmata 62 // Handle graceful shutdown.
9f5df4d… lmata 63 sigCh := make(chan os.Signal, 1)
9f5df4d… lmata 64 signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
9f5df4d… lmata 65
9f5df4d… lmata 66 var lastStart string
9f5df4d… lmata 67 failures := 0
9f5df4d… lmata 68 client := &http.Client{Timeout: 5 * time.Second}
9f5df4d… lmata 69
9f5df4d… lmata 70 fmt.Fprintf(os.Stderr, "relay-watchdog: monitoring %s every %s\n", *url, *interval)
9f5df4d… lmata 71
9f5df4d… lmata 72 ticker := time.NewTicker(*interval)
9f5df4d… lmata 73 defer ticker.Stop()
9f5df4d… lmata 74
9f5df4d… lmata 75 for {
9f5df4d… lmata 76 select {
9f5df4d… lmata 77 case <-sigCh:
9f5df4d… lmata 78 fmt.Fprintf(os.Stderr, "relay-watchdog: shutting down\n")
9f5df4d… lmata 79 return
9f5df4d… lmata 80 case <-ticker.C:
9f5df4d… lmata 81 }
9f5df4d… lmata 82
9f5df4d… lmata 83 start := getStart(client, *url, *token)
9f5df4d… lmata 84 if start == "" {
9f5df4d… lmata 85 failures++
9f5df4d… lmata 86 fmt.Fprintf(os.Stderr, "relay-watchdog: API unreachable (%d)\n", failures)
9f5df4d… lmata 87 if failures >= 6 { // 60s at 10s interval
9f5df4d… lmata 88 fmt.Fprintf(os.Stderr, "relay-watchdog: extended outage, will signal relays on recovery\n")
9f5df4d… lmata 89 }
9f5df4d… lmata 90 continue
9f5df4d… lmata 91 }
9f5df4d… lmata 92
9f5df4d… lmata 93 if failures >= 6 {
9f5df4d… lmata 94 // We were down for a while and just came back.
9f5df4d… lmata 95 fmt.Fprintf(os.Stderr, "relay-watchdog: API recovered after %d failures, killing relays\n", failures)
9f5df4d… lmata 96 killRelays()
9f5df4d… lmata 97 lastStart = start
9f5df4d… lmata 98 failures = 0
9f5df4d… lmata 99 continue
9f5df4d… lmata 100 }
9f5df4d… lmata 101
9f5df4d… lmata 102 if lastStart == "" {
9f5df4d… lmata 103 lastStart = start
9f5df4d… lmata 104 failures = 0
9f5df4d… lmata 105 continue
9f5df4d… lmata 106 }
9f5df4d… lmata 107
9f5df4d… lmata 108 if start != lastStart {
9f5df4d… lmata 109 fmt.Fprintf(os.Stderr, "relay-watchdog: server restarted (was %s, now %s), killing relays\n", lastStart, start)
9f5df4d… lmata 110 killRelays()
9f5df4d… lmata 111 lastStart = start
9f5df4d… lmata 112 }
9f5df4d… lmata 113 failures = 0
9f5df4d… lmata 114 }
9f5df4d… lmata 115 }
9f5df4d… lmata 116
// getStart queries the scuttlebot API rooted at url and returns the value of
// the "started" field from the JSON body of GET <url>/v1/status.
//
// token is sent as a Bearer Authorization header. The empty string is
// returned when the request cannot be built or sent, when the server answers
// with a non-2xx status, or when the body is not valid JSON; the caller
// treats "" as "API unreachable".
func getStart(client *http.Client, url, token string) string {
	// Tolerate a base URL configured with a trailing slash.
	endpoint := strings.TrimRight(url, "/") + "/v1/status"
	req, err := http.NewRequest(http.MethodGet, endpoint, nil)
	if err != nil {
		return ""
	}
	req.Header.Set("Authorization", "Bearer "+token)

	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	// An error response is not a status report, whatever its body contains.
	if resp.StatusCode/100 != 2 {
		return ""
	}

	var s struct {
		Started string `json:"started"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&s); err != nil {
		return ""
	}
	return s.Started
}
9f5df4d… lmata 134
// killRelays sends SIGUSR1 to every process whose command line matches
// "(claude|codex|gemini)-relay" according to `pgrep -f`, skipping the
// watchdog's own PID.
//
// The relay handles SIGUSR1 by tearing down and rebuilding the IRC
// connection without killing the Claude subprocess, so despite the name no
// process is meant to be terminated. Failures are logged to stderr and do
// not abort the remaining signals.
func killRelays() {
	out, err := exec.Command("pgrep", "-f", "(claude|codex|gemini)-relay").Output()
	if err != nil {
		// pgrep exits with status 1 when nothing matched. Any other error
		// (pgrep missing, bad invocation, ...) is a real failure and must
		// not be reported as "no relays".
		if ee, ok := err.(*exec.ExitError); ok && ee.ExitCode() == 1 {
			fmt.Fprintf(os.Stderr, "relay-watchdog: no relay processes found\n")
			return
		}
		fmt.Fprintf(os.Stderr, "relay-watchdog: pgrep failed: %v\n", err)
		return
	}

	self := fmt.Sprintf("%d", os.Getpid())
	for _, pid := range strings.Fields(string(out)) {
		if pid == self {
			continue
		}
		fmt.Fprintf(os.Stderr, "relay-watchdog: signaling relay pid %s (SIGUSR1)\n", pid)
		// Best effort: the process may have exited since pgrep ran.
		if err := exec.Command("kill", "-USR1", pid).Run(); err != nil {
			fmt.Fprintf(os.Stderr, "relay-watchdog: failed to signal pid %s: %v\n", pid, err)
		}
	}
}

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button