From a58b7682f3b148571143fcb2499b56df4b5f1179 Mon Sep 17 00:00:00 2001 From: GigabiteStudios Date: Sat, 20 Jun 2026 18:08:00 -0500 Subject: [PATCH] feat(core): add uptime monitoring and command history --- internal/app/app.go | 2 + internal/config/config.go | 2 +- internal/db/db.go | 42 +++ internal/handlers/handlers.go | 272 +++++++++++++- internal/models/models.go | 56 +++ internal/services/node.go | 519 ++++++++++++++++++++++++++- internal/views/pages/settings.gohtml | 77 ++-- internal/views/pages/uptime.gohtml | 182 ++++++++++ 8 files changed, 1091 insertions(+), 61 deletions(-) create mode 100644 internal/views/pages/uptime.gohtml diff --git a/internal/app/app.go b/internal/app/app.go index 86b3d3b..f76f767 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -96,6 +96,7 @@ func New() (*App, error) { protected.Get("/nodes/{nodeID}/console/ws", handler.NodeConsoleWebSocket) protected.Get("/groups", handler.GroupsPage) protected.Get("/automations", handler.AutomationsPage) + protected.Get("/uptime", handler.UptimePage) protected.Get("/settings", handler.SettingsPage) protected.Post("/settings/theme", handler.UpdateTheme) @@ -107,6 +108,7 @@ func New() (*App, error) { editor.Post("/nodes/{nodeID}/commands", handler.NodeQuickCommand) editor.Post("/nodes/{nodeID}/delete", handler.DeleteNode) editor.Post("/automations", handler.CreateAutomation) + editor.Post("/uptime/run", handler.RunUptimeChecks) }) }) diff --git a/internal/config/config.go b/internal/config/config.go index d0a6a93..bc5892f 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -25,7 +25,7 @@ func Load() Config { EncryptionKey: env("MAINTAINARR_ENCRYPTION_KEY", "change-me-encryption-key-32bytes"), OrgName: env("MAINTAINARR_ORG_NAME", "Maintainarr"), BaseURL: env("MAINTAINARR_BASE_URL", "http://localhost:8080"), - DefaultTheme: env("MAINTAINARR_THEME", "blue"), + DefaultTheme: env("MAINTAINARR_THEME", "dark"), DefaultMode: env("MAINTAINARR_THEME_MODE", "dark"), RefreshCron: env("MAINTAINARR_REFRESH_CRON", "@every 5s"), } diff --git a/internal/db/db.go b/internal/db/db.go index cb8ce09..cb89734 100644 --- a/internal/db/db.go +++ b/internal/db/db.go @@ -127,6 +127,7 @@ func migrate(ctx context.Context, database *sql.DB) error { job_id INTEGER, node_id INTEGER NOT NULL, action TEXT NOT NULL, + command_text TEXT NOT NULL DEFAULT '', status TEXT NOT NULL, output TEXT NOT NULL DEFAULT '', triggered_by INTEGER, @@ -136,6 +137,44 @@ func migrate(ctx context.Context, database *sql.DB) error { FOREIGN KEY (node_id) REFERENCES nodes(id), FOREIGN KEY (triggered_by) REFERENCES users(id) );`, + `CREATE TABLE IF NOT EXISTS uptime_monitors ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + organization_id INTEGER NOT NULL, + node_id INTEGER NOT NULL UNIQUE, + name TEXT NOT NULL, + target TEXT NOT NULL, + monitor_type TEXT NOT NULL DEFAULT 'ssh', + interval_seconds INTEGER NOT NULL DEFAULT 60, + enabled BOOLEAN NOT NULL DEFAULT 1, + last_status TEXT NOT NULL DEFAULT 'unknown', + last_latency_ms INTEGER NOT NULL DEFAULT 0, + last_checked_at DATETIME, + last_error TEXT NOT NULL DEFAULT '', + up_since_at DATETIME, + current_outage_started_at DATETIME, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (organization_id) REFERENCES organizations(id), + FOREIGN KEY (node_id) REFERENCES nodes(id) + );`, + `CREATE TABLE IF NOT EXISTS uptime_checks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + monitor_id INTEGER NOT NULL, + status TEXT NOT NULL, + latency_ms INTEGER NOT NULL DEFAULT 0, + error_message TEXT NOT NULL DEFAULT '', + checked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (monitor_id) REFERENCES uptime_monitors(id) + );`, + `CREATE TABLE IF NOT EXISTS uptime_incidents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + monitor_id INTEGER NOT NULL, + error_message TEXT NOT NULL DEFAULT '', + started_at DATETIME NOT NULL, + ended_at DATETIME, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (monitor_id) REFERENCES uptime_monitors(id) + );`, } for _, statement := range statements { @@ -158,6 +197,9 @@ func migrate(ctx context.Context, database *sql.DB) error { `ALTER TABLE nodes ADD COLUMN memory_total_mb INTEGER NOT NULL DEFAULT 0;`, `ALTER TABLE nodes ADD COLUMN disk_total_gb INTEGER NOT NULL DEFAULT 0;`, `ALTER TABLE automation_jobs ADD COLUMN tag TEXT NOT NULL DEFAULT '';`, + `ALTER TABLE command_runs ADD COLUMN command_text TEXT NOT NULL DEFAULT '';`, + `ALTER TABLE uptime_monitors ADD COLUMN last_error TEXT NOT NULL DEFAULT '';`, + `ALTER TABLE uptime_monitors ADD COLUMN up_since_at DATETIME;`, } for _, statement := range alterStatements { diff --git a/internal/handlers/handlers.go b/internal/handlers/handlers.go index e23ca66..05cb05e 100644 --- a/internal/handlers/handlers.go +++ b/internal/handlers/handlers.go @@ -57,6 +57,7 @@ type settingsData struct { ThemeVariables template.CSS CurrentTheme string CurrentMode string + Runs []models.CommandRun } type jobsPageData struct { @@ -67,6 +68,39 @@ type jobsPageData struct { Runs []models.CommandRun } +type uptimePageData struct { + Summary uptimeSummary + Periods []uptimePeriodRow + Monitors []uptimeMonitorCard + Incidents []models.UptimeIncident +} + +type uptimeSummary struct { + TotalMonitors int + UpMonitors int + DownMonitors int + AvgLatencyMS int64 +} + +type uptimePeriodRow struct { + Label string + AvailabilityText string + DowntimeText string + Incidents int64 + LongestText string + AverageText string +} + +type uptimeMonitorCard struct { + Monitor models.UptimeMonitor + Availability float64 + AvailabilityText string + LastCheckedText string + StateDurationText string + IntervalText string + RecentChecks []models.UptimeCheck +} + func New(repo *services.Repository, auth *services.AuthService, sessions *services.SessionService, nodes *services.NodeService, renderer *views.Renderer, org models.Organization, baseURL string) *Handler { return &Handler{ repo: repo, @@ -336,11 +370,16 @@ func (h *Handler) CreateNode(w http.ResponseWriter, r *http.Request) { http.Error(w, "failed to create node", http.StatusInternalServerError) return } + if err := h.nodes.EnsureUptimeMonitorForNode(r.Context(), node); err != nil { + http.Error(w, "failed to create node monitor", http.StatusInternalServerError) + return + } if node.SSHUsername != "" && node.SSHPassword != "" { _, _ = h.nodes.RefreshNodeInventory(r.Context(), node) _, _ = h.nodes.RefreshNodeStats(r.Context(), node) } + _ = h.nodes.RunAllUptimeChecks(r.Context(), h.org.ID) http.Redirect(w, r, "/dashboard", http.StatusSeeOther) } @@ -710,13 +749,33 @@ func (h *Handler) CreateAutomation(w http.ResponseWriter, r *http.Request) { func (h *Handler) SettingsPage(w http.ResponseWriter, r *http.Request) { user := localmiddleware.CurrentUser(r) org := h.currentOrganization(r.Context()) + currentMode := normalizeMode(org.ThemeMode) + currentTheme := normalizeTheme(org.Theme, currentMode) h.render(w, r, "settings", "Theme System", settingsData{ ThemeVariables: template.CSS(themePreview), - CurrentTheme: org.Theme, - CurrentMode: org.ThemeMode, + CurrentTheme: currentTheme, + CurrentMode: currentMode, + Runs: h.settingsRuns(r.Context()), }, user) } +func (h *Handler) UptimePage(w http.ResponseWriter, r *http.Request) { + user := localmiddleware.CurrentUser(r) + h.render(w, r, "uptime", "Uptime", h.uptimePageData(r.Context()), user) +} + +func (h *Handler) RunUptimeChecks(w http.ResponseWriter, r *http.Request) { + if err := h.nodes.EnsureUptimeMonitors(r.Context(), h.org.ID); err != nil { + http.Error(w, "failed to sync monitors", http.StatusInternalServerError) + return + } + if err := h.nodes.RunAllUptimeChecks(r.Context(), h.org.ID); err != nil { + http.Error(w, "failed to run uptime checks", http.StatusBadGateway) + return + } + http.Redirect(w, r, "/uptime", http.StatusSeeOther) +} + func (h *Handler) UpdateTheme(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { http.Error(w, "bad request", http.StatusBadRequest) @@ -725,6 +784,9 @@ func (h *Handler) UpdateTheme(w http.ResponseWriter, r *http.Request) { theme := strings.TrimSpace(r.FormValue("theme")) mode := strings.TrimSpace(r.FormValue("mode")) + if mode == "" { + mode = theme + } if !isAllowedTheme(theme) || !isAllowedMode(mode) { http.Error(w, "invalid theme selection", http.StatusBadRequest) return @@ -750,12 +812,14 @@ func (h *Handler) render(w http.ResponseWriter, r *http.Request, page, title str org := h.currentOrganization(r.Context()) groups, _ := h.repo.ListGroups(r.Context(), org.ID) tags, _ := h.repo.ListTags(r.Context(), org.ID) + mode := normalizeMode(org.ThemeMode) + theme := normalizeTheme(org.Theme, mode) h.renderer.Render(w, page, views.ViewData{ Title: title, Shell: shellForPage(page), - ThemeClass: org.Theme, - ThemeMode: org.ThemeMode, + ThemeClass: theme, + ThemeMode: mode, CurrentPath: r.URL.Path, User: currentUser, Organization: org, @@ -797,6 +861,75 @@ func (h *Handler) dashboardData(ctx context.Context) dashboardData { } } +func (h *Handler) uptimePageData(ctx context.Context) uptimePageData { + _ = h.nodes.EnsureUptimeMonitors(ctx, h.org.ID) + + monitors, _ := h.nodes.ListUptimeMonitors(ctx, h.org.ID) + checksByMonitor, _ := h.nodes.ListRecentUptimeChecks(ctx, h.org.ID, 24) + incidents, _ := h.nodes.ListRecentUptimeIncidents(ctx, h.org.ID, 20) + + summary := uptimeSummary{ + TotalMonitors: len(monitors), + } + var latencyTotal int64 + var latencyCount int64 + cards := make([]uptimeMonitorCard, 0, len(monitors)) + for _, monitor := range monitors { + if monitor.LastStatus == "down" { + summary.DownMonitors++ + } else if monitor.LastStatus == "up" { + summary.UpMonitors++ + } + if monitor.LastStatus == "up" && monitor.LastLatencyMS > 0 { + latencyTotal += monitor.LastLatencyMS + latencyCount++ + } + + recentChecks := checksByMonitor[monitor.ID] + card := uptimeMonitorCard{ + Monitor: monitor, + Availability: availabilityForChecks(recentChecks), + AvailabilityText: fmt.Sprintf("%.2f%%", availabilityForChecks(recentChecks)), + LastCheckedText: relativeTime(monitor.LastCheckedAt), + StateDurationText: monitorStateDuration(monitor), + IntervalText: humanizeInterval(monitor.IntervalSeconds), + RecentChecks: reverseChecks(recentChecks), + } + cards = append(cards, card) + } + if latencyCount > 0 { + summary.AvgLatencyMS = latencyTotal / latencyCount + } + + now := time.Now() + periods := []uptimePeriodRow{ + h.periodRow(ctx, "Today", ptrTime(now.Add(-24*time.Hour))), + h.periodRow(ctx, "Last 7 days", ptrTime(now.Add(-7*24*time.Hour))), + h.periodRow(ctx, "Last 30 days", ptrTime(now.Add(-30*24*time.Hour))), + h.periodRow(ctx, "Last 365 days", ptrTime(now.Add(-365*24*time.Hour))), + h.periodRow(ctx, "All time", nil), + } + + return uptimePageData{ + Summary: summary, + Periods: periods, + Monitors: cards, + Incidents: incidents, + } +} + +func (h *Handler) periodRow(ctx context.Context, label string, since *time.Time) uptimePeriodRow { + summary, _ := h.nodes.UptimePeriodSummary(ctx, h.org.ID, since) + return uptimePeriodRow{ + Label: label, + AvailabilityText: availabilityText(summary), + DowntimeText: humanizeSeconds(summary.DowntimeSeconds), + Incidents: summary.IncidentCount, + LongestText: humanizeSeconds(summary.LongestIncidentSeconds), + AverageText: humanizeSeconds(summary.AvgIncidentSeconds), + } +} + func (h *Handler) currentOrganization(ctx context.Context) models.Organization { org, err := h.repo.GetOrganization(ctx) if err != nil { @@ -805,6 +938,109 @@ func (h *Handler) currentOrganization(ctx context.Context) models.Organization { return org } +func availabilityForChecks(checks []models.UptimeCheck) float64 { + if len(checks) == 0 { + return 0 + } + var up int + for _, check := range checks { + if check.Status == "up" { + up++ + } + } + return float64(up) * 100 / float64(len(checks)) +} + +func availabilityText(summary models.UptimePeriodSummary) string { + if summary.TotalChecks == 0 { + return "0.00%" + } + return fmt.Sprintf("%.2f%%", float64(summary.UpChecks)*100/float64(summary.TotalChecks)) +} + +func relativeTime(value *time.Time) string { + if value == nil { + return "Never" + } + diff := time.Since(*value).Round(time.Second) + if diff < time.Minute { + seconds := int(diff.Seconds()) + if seconds < 1 { + seconds = 1 + } + return fmt.Sprintf("%ds ago", seconds) + } + if diff < time.Hour { + return fmt.Sprintf("%dm ago", int(diff.Minutes())) + } + if diff < 24*time.Hour { + return fmt.Sprintf("%dh ago", int(diff.Hours())) + } + return value.Format("2006-01-02 15:04") +} + +func humanizeInterval(seconds int64) string { + if seconds <= 0 { + return "-" + } + if seconds%3600 == 0 { + hours := seconds / 3600 + if hours == 1 { + return "Every hour" + } + return fmt.Sprintf("Every %d hours", hours) + } + if seconds%60 == 0 { + minutes := seconds / 60 + if minutes == 1 { + return "Every minute" + } + return fmt.Sprintf("Every %d minutes", minutes) + } + return fmt.Sprintf("Every %d seconds", seconds) +} + +func monitorStateDuration(monitor models.UptimeMonitor) string { + if monitor.LastStatus == "up" && monitor.UpSinceAt != nil { + return "Up for " + humanizeSeconds(int64(time.Since(*monitor.UpSinceAt).Seconds())) + } + if monitor.LastStatus == "down" && monitor.CurrentOutageStartedAt != nil { + return "Down for " + humanizeSeconds(int64(time.Since(*monitor.CurrentOutageStartedAt).Seconds())) + } + return "Awaiting checks" +} + +func humanizeSeconds(seconds int64) string { + if seconds <= 0 { + return "-" + } + duration := time.Duration(seconds) * time.Second + days := int(duration / (24 * time.Hour)) + duration -= time.Duration(days) * 24 * time.Hour + hours := int(duration / time.Hour) + duration -= time.Duration(hours) * time.Hour + minutes := int(duration / time.Minute) + if days > 0 { + return fmt.Sprintf("%dd %dh %dm", days, hours, minutes) + } + if hours > 0 { + return fmt.Sprintf("%dh %dm", hours, minutes) + } + return fmt.Sprintf("%dm", minutes) +} + +func ptrTime(value time.Time) *time.Time { + return &value +} + +func reverseChecks(checks []models.UptimeCheck) []models.UptimeCheck { + reversed := make([]models.UptimeCheck, len(checks)) + for i := range checks { + reversed[len(checks)-1-i] = checks[i] + } + return reversed +} + func shellForPage(page string) string { switch page { case "login", "login_otp", "register": @@ -826,7 +1062,7 @@ func (h *Handler) listRuns(ctx context.Context, nodeID int64) []models.CommandRu var runs []models.CommandRun for rows.Next() { var run models.CommandRun - if err := rows.Scan(&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.Status, &run.Output, &run.TriggeredBy, &run.StartedAt, &run.FinishedAt); err == nil { + if err := rows.Scan(&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.CommandText, &run.Status, &run.Output, &run.TriggeredBy, &run.StartedAt, &run.FinishedAt); err == nil { runs = append(runs, run) } } @@ -835,7 +1071,7 @@ func (h *Handler) listRuns(ctx context.Context, nodeID int64) []models.CommandRu func (h *Handler) nodesRunQuery(ctx context.Context, nodeID int64) (*sql.Rows, error) { return h.nodesDB().QueryContext(ctx, ` - SELECT id, job_id, node_id, action, status, output, triggered_by, started_at, finished_at + SELECT id, job_id, node_id, action, command_text, status, output, triggered_by, started_at, finished_at FROM command_runs WHERE node_id = ? ORDER BY started_at DESC @@ -843,6 +1079,14 @@ func (h *Handler) nodesRunQuery(ctx context.Context, nodeID int64) (*sql.Rows, e `, nodeID) } +func (h *Handler) settingsRuns(ctx context.Context) []models.CommandRun { + runs, err := h.nodes.ListCommandHistory(ctx, h.org.ID) + if err != nil { + return nil + } + return runs +} + func (h *Handler) nodesDB() *sql.DB { return h.repo.DB() } @@ -905,7 +1149,7 @@ func defaultIfEmpty(value, fallback string) string { func isAllowedTheme(value string) bool { switch value { - case "dark", "light", "green", "red", "blue": + case "dark", "light": return true default: return false @@ -916,6 +1160,20 @@ func isAllowedMode(value string) bool { return value == "dark" || value == "light" } +func normalizeTheme(theme, mode string) string { + if isAllowedTheme(theme) { + return theme + } + return normalizeMode(mode) +} + +func normalizeMode(mode string) string { + if isAllowedMode(mode) { + return mode + } + return "dark" +} + func saveKeyUpload(r *http.Request, field string) (string, error) { file, header, err := r.FormFile(field) if err != nil { diff --git a/internal/models/models.go b/internal/models/models.go index 31036dd..e338cac 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -98,7 +98,9 @@ type CommandRun struct { NodeID int64 JobName string NodeName string + GroupName string Action string + CommandText string Status string Output string TriggeredBy *int64 @@ -106,3 +108,57 @@ type CommandRun struct { FinishedAt *time.Time DurationText string } + +type UptimeMonitor struct { + ID int64 + OrganizationID int64 + NodeID int64 + NodeName string + GroupName string + Name string + Target string + MonitorType string + IntervalSeconds int64 + Enabled bool + LastStatus string + LastLatencyMS int64 + LastCheckedAt *time.Time + LastError string + UpSinceAt *time.Time + CurrentOutageStartedAt *time.Time + CreatedAt time.Time + UpdatedAt time.Time +} + +type UptimeCheck struct { + ID int64 + MonitorID int64 + Status string + LatencyMS int64 + ErrorMessage string + CheckedAt time.Time +} + +type UptimeIncident struct { + ID int64 + MonitorID int64 + MonitorName string + NodeName string + GroupName string + ErrorMessage string + StartedAt time.Time + EndedAt *time.Time + DurationSeconds int64 + DurationText string +} + +type UptimePeriodSummary struct { + TotalChecks int64 + UpChecks int64 + DownChecks int64 + AvgLatencyMS int64 + DowntimeSeconds int64 + IncidentCount int64 + LongestIncidentSeconds int64 + AvgIncidentSeconds int64 +} diff --git a/internal/services/node.go b/internal/services/node.go index bf587e5..72685d4 100644 --- a/internal/services/node.go +++ b/internal/services/node.go @@ -5,6 +5,7 @@ import ( "database/sql" "fmt" "net" + "regexp" "strings" "time" @@ -172,6 +173,39 @@ func (s *NodeService) SaveNode(ctx context.Context, node *models.Node) error { return err } +func (s *NodeService) EnsureUptimeMonitorForNode(ctx context.Context, node *models.Node) error { + target := fmt.Sprintf("%s:%d", strings.TrimSpace(node.IPAddress), node.SSHPort) + name := strings.TrimSpace(node.Name) + if name == "" { + name = target + } + + _, err := s.db.ExecContext(ctx, ` + INSERT INTO uptime_monitors ( + organization_id, node_id, name, target, monitor_type, interval_seconds, enabled + ) VALUES (?, ?, ?, ?, 'ssh', 60, 1) + ON CONFLICT(node_id) DO UPDATE SET + organization_id = excluded.organization_id, + name = excluded.name, + target = excluded.target, + updated_at = CURRENT_TIMESTAMP + `, node.OrganizationID, node.ID, name, target) + return err +} + +func (s *NodeService) EnsureUptimeMonitors(ctx context.Context, orgID int64) error { + nodes, err := s.ListNodes(ctx, orgID) + if err != nil { + return err + } + for i := range nodes { + if err := s.EnsureUptimeMonitorForNode(ctx, &nodes[i]); err != nil { + return err + } + } + return nil +} + func (s *NodeService) DeleteNode(ctx context.Context, orgID, nodeID int64) error { tx, err := s.db.BeginTx(ctx, nil) if err != nil { @@ -237,6 +271,13 @@ echo "UPTIME=${uptime:-0}" output, err := s.RunSSHCommand(ctx, node, strings.TrimSpace(statsScript)) if err != nil { + s.logCommandRun(ctx, commandRunParams{ + NodeID: node.ID, + Action: "refresh-stats", + CommandText: sanitizeCommand(statsScript), + Status: "failed", + Output: strings.TrimSpace(output + "\n" + err.Error()), + }) return "", err } @@ -270,6 +311,13 @@ echo "UPTIME=${uptime:-0}" node.DiskUsage = stats["DISK"] node.UptimeSeconds = int64(stats["UPTIME"]) node.LastSeenAt = &now + s.logCommandRun(ctx, commandRunParams{ + NodeID: node.ID, + Action: "refresh-stats", + CommandText: sanitizeCommand(statsScript), + Status: "completed", + Output: output, + }) return output, nil } @@ -304,6 +352,15 @@ func (s *NodeService) RefreshNodeInventory(ctx context.Context, node *models.Nod `DISK_GB="$(df -BG / 2>/dev/null | awk 'NR==2 {gsub(/G/, "", $2); print $2}')"; echo DISK_GB="${DISK_GB:-0}"`, }, " ; ")) if err != nil { + s.logCommandRun(ctx, commandRunParams{ + NodeID: node.ID, + Action: "refresh-inventory", + CommandText: sanitizeCommand(strings.Join([]string{ + `read /etc/os-release, hostname, kernel, package manager, cpu, gpu, shell, package count, memory, disk`, + }, "")), + Status: "failed", + Output: strings.TrimSpace(output + "\n" + err.Error()), + }) return "", err } @@ -353,6 +410,14 @@ func (s *NodeService) RefreshNodeInventory(ctx context.Context, node *models.Nod return output, err } + s.logCommandRun(ctx, commandRunParams{ + NodeID: node.ID, + Action: "refresh-inventory", + CommandText: "inventory probe", + Status: "completed", + Output: output, + }) + return output, nil } @@ -419,10 +484,14 @@ func (s *NodeService) RunAction(ctx context.Context, node *models.Node, action s status = "failed" } - _, _ = s.db.ExecContext(ctx, ` - INSERT INTO command_runs (node_id, action, status, output, triggered_by) - VALUES (?, ?, ?, ?, ?) - `, node.ID, action, status, output, userID) + s.logCommandRun(ctx, commandRunParams{ + NodeID: node.ID, + Action: action, + CommandText: sanitizeCommand(command), + Status: status, + Output: output, + TriggeredBy: userID, + }) return output, err } @@ -436,10 +505,16 @@ func (s *NodeService) RunAdHocCommand(ctx context.Context, node *models.Node, la } now := time.Now() - _, _ = s.db.ExecContext(ctx, ` - INSERT INTO command_runs (node_id, action, status, output, triggered_by, started_at, finished_at) - VALUES (?, ?, ?, ?, ?, ?, ?) - `, node.ID, label, status, output, userID, now, now) + s.logCommandRun(ctx, commandRunParams{ + NodeID: node.ID, + Action: label, + CommandText: sanitizeCommand(command), + Status: status, + Output: output, + TriggeredBy: userID, + StartedAt: &now, + FinishedAt: &now, + }) return output, err } @@ -487,11 +562,12 @@ func (s *NodeService) CreateAutomation(ctx context.Context, job *models.Automati func (s *NodeService) ListJobRuns(ctx context.Context, orgID int64) ([]models.CommandRun, error) { rows, err := s.db.QueryContext(ctx, ` - SELECT cr.id, cr.job_id, cr.node_id, cr.action, cr.status, cr.output, cr.triggered_by, - cr.started_at, cr.finished_at, COALESCE(j.name, ''), COALESCE(n.name, '') + SELECT cr.id, cr.job_id, cr.node_id, cr.action, cr.command_text, cr.status, cr.output, cr.triggered_by, + cr.started_at, cr.finished_at, COALESCE(j.name, ''), COALESCE(n.name, ''), COALESCE(g.name, '') FROM command_runs cr LEFT JOIN automation_jobs j ON j.id = cr.job_id LEFT JOIN nodes n ON n.id = cr.node_id + LEFT JOIN vm_groups g ON g.id = n.group_id WHERE j.organization_id = ? OR (j.id IS NULL AND n.organization_id = ?) ORDER BY cr.started_at DESC LIMIT 50 @@ -505,8 +581,8 @@ func (s *NodeService) ListJobRuns(ctx context.Context, orgID int64) ([]models.Co for rows.Next() { var run models.CommandRun if err := rows.Scan( - &run.ID, &run.JobID, &run.NodeID, &run.Action, &run.Status, &run.Output, &run.TriggeredBy, - &run.StartedAt, &run.FinishedAt, &run.JobName, &run.NodeName, + &run.ID, &run.JobID, &run.NodeID, &run.Action, &run.CommandText, &run.Status, &run.Output, &run.TriggeredBy, + &run.StartedAt, &run.FinishedAt, &run.JobName, &run.NodeName, &run.GroupName, ); err != nil { return nil, err } @@ -517,6 +593,308 @@ func (s *NodeService) ListJobRuns(ctx context.Context, orgID int64) ([]models.Co return runs, rows.Err() } +func (s *NodeService) ListCommandHistory(ctx context.Context, orgID int64) ([]models.CommandRun, error) { + rows, err := s.db.QueryContext(ctx, ` + SELECT cr.id, cr.job_id, cr.node_id, cr.action, cr.command_text, cr.status, cr.output, cr.triggered_by, + cr.started_at, cr.finished_at, COALESCE(j.name, ''), COALESCE(n.name, ''), COALESCE(g.name, '') + FROM command_runs cr + LEFT JOIN automation_jobs j ON j.id = cr.job_id + LEFT JOIN nodes n ON n.id = cr.node_id + LEFT JOIN vm_groups g ON g.id = n.group_id + WHERE n.organization_id = ? OR j.organization_id = ? + ORDER BY cr.started_at DESC + LIMIT 200 + `, orgID, orgID) + if err != nil { + return nil, err + } + defer rows.Close() + + var runs []models.CommandRun + for rows.Next() { + var run models.CommandRun + if err := rows.Scan( + &run.ID, &run.JobID, &run.NodeID, &run.Action, &run.CommandText, &run.Status, &run.Output, &run.TriggeredBy, + &run.StartedAt, &run.FinishedAt, &run.JobName, &run.NodeName, &run.GroupName, + ); err != nil { + return nil, err + } + run.DurationText = formatDuration(run.StartedAt, run.FinishedAt) + runs = append(runs, run) + } + return runs, rows.Err() +} + +func (s *NodeService) ListUptimeMonitors(ctx context.Context, orgID int64) ([]models.UptimeMonitor, error) { + rows, err := s.db.QueryContext(ctx, ` + SELECT m.id, m.organization_id, m.node_id, COALESCE(n.name, ''), COALESCE(g.name, ''), + m.name, m.target, m.monitor_type, m.interval_seconds, m.enabled, m.last_status, + m.last_latency_ms, m.last_checked_at, m.last_error, m.up_since_at, m.current_outage_started_at, + m.created_at, m.updated_at + FROM uptime_monitors m + LEFT JOIN nodes n ON n.id = m.node_id + LEFT JOIN vm_groups g ON g.id = n.group_id + WHERE m.organization_id = ? + ORDER BY m.name ASC + `, orgID) + if err != nil { + return nil, err + } + defer rows.Close() + + var monitors []models.UptimeMonitor + for rows.Next() { + var monitor models.UptimeMonitor + if err := rows.Scan( + &monitor.ID, &monitor.OrganizationID, &monitor.NodeID, &monitor.NodeName, &monitor.GroupName, + &monitor.Name, &monitor.Target, &monitor.MonitorType, &monitor.IntervalSeconds, &monitor.Enabled, &monitor.LastStatus, + &monitor.LastLatencyMS, &monitor.LastCheckedAt, &monitor.LastError, &monitor.UpSinceAt, &monitor.CurrentOutageStartedAt, + &monitor.CreatedAt, &monitor.UpdatedAt, + ); err != nil { + return nil, err + } + monitors = append(monitors, monitor) + } + return monitors, rows.Err() +} + +func (s *NodeService) ListRecentUptimeChecks(ctx context.Context, orgID int64, limitPerMonitor int) (map[int64][]models.UptimeCheck, error) { + if limitPerMonitor <= 0 { + limitPerMonitor = 24 + } + + rows, err := s.db.QueryContext(ctx, ` + SELECT c.id, c.monitor_id, c.status, c.latency_ms, c.error_message, c.checked_at + FROM uptime_checks c + INNER JOIN uptime_monitors m ON m.id = c.monitor_id + WHERE m.organization_id = ? + ORDER BY c.checked_at DESC + LIMIT 500 + `, orgID) + if err != nil { + return nil, err + } + defer rows.Close() + + results := map[int64][]models.UptimeCheck{} + for rows.Next() { + var check models.UptimeCheck + if err := rows.Scan(&check.ID, &check.MonitorID, &check.Status, &check.LatencyMS, &check.ErrorMessage, &check.CheckedAt); err != nil { + return nil, err + } + if len(results[check.MonitorID]) >= limitPerMonitor { + continue + } + results[check.MonitorID] = append(results[check.MonitorID], check) + } + return results, rows.Err() +} + +func (s *NodeService) ListRecentUptimeIncidents(ctx context.Context, orgID int64, limit int) ([]models.UptimeIncident, error) { + if limit <= 0 { + limit = 20 + } + rows, err := s.db.QueryContext(ctx, ` + SELECT i.id, i.monitor_id, COALESCE(m.name, ''), COALESCE(n.name, ''), COALESCE(g.name, ''), + i.error_message, i.started_at, i.ended_at + FROM uptime_incidents i + INNER JOIN uptime_monitors m ON m.id = i.monitor_id + LEFT JOIN nodes n ON n.id = m.node_id + LEFT JOIN vm_groups g ON g.id = n.group_id + WHERE m.organization_id = ? + ORDER BY i.started_at DESC + LIMIT ? + `, orgID, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + var incidents []models.UptimeIncident + for rows.Next() { + var incident models.UptimeIncident + if err := rows.Scan( + &incident.ID, &incident.MonitorID, &incident.MonitorName, &incident.NodeName, &incident.GroupName, + &incident.ErrorMessage, &incident.StartedAt, &incident.EndedAt, + ); err != nil { + return nil, err + } + incident.DurationSeconds = incidentDurationSeconds(incident.StartedAt, incident.EndedAt) + incident.DurationText = humanDuration(time.Duration(incident.DurationSeconds) * time.Second) + incidents = append(incidents, incident) + } + return incidents, rows.Err() +} + +func (s *NodeService) UptimePeriodSummary(ctx context.Context, orgID int64, since *time.Time) (models.UptimePeriodSummary, error) { + var summary models.UptimePeriodSummary + args := []any{orgID} + filter := "" + if since != nil { + filter = " AND c.checked_at >= ?" + args = append(args, *since) + } + + if err := s.db.QueryRowContext(ctx, ` + SELECT + COUNT(*), + COALESCE(SUM(CASE WHEN c.status = 'up' THEN 1 ELSE 0 END), 0), + COALESCE(SUM(CASE WHEN c.status = 'down' THEN 1 ELSE 0 END), 0), + COALESCE(CAST(AVG(CASE WHEN c.status = 'up' THEN c.latency_ms END) AS INTEGER), 0), + COALESCE(SUM(CASE WHEN c.status = 'down' THEN m.interval_seconds ELSE 0 END), 0) + FROM uptime_checks c + INNER JOIN uptime_monitors m ON m.id = c.monitor_id + WHERE m.organization_id = ?`+filter, args...).Scan( + &summary.TotalChecks, &summary.UpChecks, &summary.DownChecks, &summary.AvgLatencyMS, &summary.DowntimeSeconds, + ); err != nil { + return summary, err + } + + incidentArgs := []any{orgID} + incidentFilter := "" + if since != nil { + incidentFilter = " AND i.started_at >= ?" + incidentArgs = append(incidentArgs, *since) + } + + var longest sql.NullInt64 + var avg sql.NullFloat64 + if err := s.db.QueryRowContext(ctx, ` + SELECT + COUNT(*), + MAX(CAST((strftime('%s', COALESCE(i.ended_at, CURRENT_TIMESTAMP)) - strftime('%s', i.started_at)) AS INTEGER)), + AVG(CAST((strftime('%s', COALESCE(i.ended_at, CURRENT_TIMESTAMP)) - strftime('%s', i.started_at)) AS INTEGER)) + FROM uptime_incidents i + INNER JOIN uptime_monitors m ON m.id = i.monitor_id + WHERE m.organization_id = ?`+incidentFilter, incidentArgs...).Scan( + &summary.IncidentCount, &longest, &avg, + ); err != nil { + return summary, err + } + if longest.Valid { + summary.LongestIncidentSeconds = longest.Int64 + } + if avg.Valid { + summary.AvgIncidentSeconds = int64(avg.Float64) + } + return summary, nil +} + +func (s *NodeService) RunAllUptimeChecks(ctx context.Context, orgID int64) error { + monitors, err := s.ListUptimeMonitors(ctx, orgID) + if err != nil { + return err + } + for i := range monitors { + if !monitors[i].Enabled { + continue + } + _ = s.RunUptimeCheck(ctx, &monitors[i]) + } + return nil +} + +func (s *NodeService) RunUptimeCheck(ctx context.Context, monitor *models.UptimeMonitor) error { + target := strings.TrimSpace(monitor.Target) + if target == "" { + return fmt.Errorf("empty monitor target") + } + + startedAt := time.Now() + timeout := 5 * time.Second + conn, err := net.DialTimeout("tcp", target, timeout) + latencyMS := int64(time.Since(startedAt).Milliseconds()) + status := "up" + errorMessage := "" + if err != nil { + status = "down" + errorMessage = err.Error() + } else { + _ = conn.Close() + } + + if latencyMS < 0 { + latencyMS = 0 + } + if _, execErr := s.db.ExecContext(ctx, ` + INSERT INTO uptime_checks (monitor_id, status, latency_ms, error_message, checked_at) + VALUES (?, ?, ?, ?, ?) + `, monitor.ID, status, latencyMS, errorMessage, startedAt); execErr != nil { + return execErr + } + + if status == "down" && monitor.LastStatus != "down" { + if _, execErr := s.db.ExecContext(ctx, ` + INSERT INTO uptime_incidents (monitor_id, error_message, started_at) + VALUES (?, ?, ?) + `, monitor.ID, errorMessage, startedAt); execErr != nil { + return execErr + } + } + + if status == "up" && monitor.LastStatus == "down" { + if _, execErr := s.db.ExecContext(ctx, ` + UPDATE uptime_incidents + SET ended_at = ? + WHERE id = ( + SELECT id + FROM uptime_incidents + WHERE monitor_id = ? AND ended_at IS NULL + ORDER BY started_at DESC + LIMIT 1 + ) + `, startedAt, monitor.ID); execErr != nil { + return execErr + } + } + + var upSinceAt any + var outageStartedAt any + if status == "up" { + if monitor.LastStatus == "up" && monitor.UpSinceAt != nil { + upSinceAt = *monitor.UpSinceAt + } else { + upSinceAt = startedAt + } + outageStartedAt = nil + } else { + if monitor.LastStatus == "down" && monitor.CurrentOutageStartedAt != nil { + outageStartedAt = *monitor.CurrentOutageStartedAt + } else { + outageStartedAt = startedAt + } + upSinceAt = nil + } + + _, err = s.db.ExecContext(ctx, ` + UPDATE uptime_monitors + SET last_status = ?, last_latency_ms = ?, last_checked_at = ?, last_error = ?, + up_since_at = ?, current_outage_started_at = ?, updated_at = CURRENT_TIMESTAMP + WHERE id = ? + `, status, latencyMS, startedAt, errorMessage, upSinceAt, outageStartedAt, monitor.ID) + if err != nil { + return err + } + + monitor.LastStatus = status + monitor.LastLatencyMS = latencyMS + monitor.LastCheckedAt = &startedAt + monitor.LastError = errorMessage + if status == "up" { + if upTime, ok := upSinceAt.(time.Time); ok { + monitor.UpSinceAt = &upTime + } + monitor.CurrentOutageStartedAt = nil + } else { + if downTime, ok := outageStartedAt.(time.Time); ok { + monitor.CurrentOutageStartedAt = &downTime + } + monitor.UpSinceAt = nil + } + + return nil +} + func sendMagicPacket(macAddress string) error { hw, err := net.ParseMAC(macAddress) if err != nil { @@ -556,6 +934,10 @@ func NewSchedulerService(database *sql.DB, nodeService *NodeService) *SchedulerS } func (s *SchedulerService) Start(ctx context.Context, orgID int64, refreshSpec string) error { + if err := s.nodeService.EnsureUptimeMonitors(ctx, orgID); err != nil { + return err + } + if _, err := s.cron.AddFunc(refreshSpec, func() { nodes, err := s.nodeService.ListNodes(ctx, orgID) if err != nil { @@ -571,6 +953,13 @@ func (s *SchedulerService) Start(ctx context.Context, orgID int64, refreshSpec s return err } + if _, err := s.cron.AddFunc("@every 1m", func() { + _ = s.nodeService.EnsureUptimeMonitors(context.Background(), orgID) + _ = s.nodeService.RunAllUptimeChecks(context.Background(), orgID) + }); err != nil { + return err + } + jobs, err := s.nodeService.ListAutomations(ctx, orgID) if err != nil { return err @@ -611,10 +1000,16 @@ func (s *SchedulerService) runAutomation(ctx context.Context, job models.Automat } finishedAt := time.Now() - _, _ = s.db.ExecContext(ctx, ` - INSERT INTO command_runs (job_id, node_id, action, status, output, started_at, finished_at) - VALUES (?, ?, ?, ?, ?, ?, ?) - `, job.ID, node.ID, job.Name, status, output, startedAt, finishedAt) + s.nodeService.logCommandRun(ctx, commandRunParams{ + JobID: &job.ID, + NodeID: node.ID, + Action: job.Name, + CommandText: sanitizeCommand(job.Command), + Status: status, + Output: output, + StartedAt: &startedAt, + FinishedAt: &finishedAt, + }) lastRunAt = finishedAt } @@ -673,3 +1068,95 @@ func formatDuration(startedAt time.Time, finishedAt *time.Time) string { } return fmt.Sprintf("%dm %ds", minutes, seconds) } + +type commandRunParams struct { + JobID *int64 + NodeID int64 + Action string + CommandText string + Status string + Output string + TriggeredBy *int64 + StartedAt *time.Time + FinishedAt *time.Time +} + +func (s *NodeService) logCommandRun(ctx context.Context, params commandRunParams) { + startedAt := time.Now() + if params.StartedAt != nil { + startedAt = *params.StartedAt + } + finishedAt := params.FinishedAt + if finishedAt == nil { + value := time.Now() + finishedAt = &value + } + + _, _ = s.db.ExecContext(ctx, ` + INSERT INTO command_runs (job_id, node_id, action, command_text, status, output, triggered_by, started_at, finished_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + `, params.JobID, params.NodeID, params.Action, params.CommandText, params.Status, params.Output, params.TriggeredBy, startedAt, finishedAt) +} + +func sanitizeCommand(command string) string { + trimmed := strings.TrimSpace(command) + if trimmed == "" { + return "" + } + + patterns := []*regexp.Regexp{ + regexp.MustCompile(`(?i)(--password(?:=|\s+))(\S+)`), + regexp.MustCompile(`(?i)(--token(?:=|\s+))(\S+)`), + regexp.MustCompile(`(?i)(--secret(?:=|\s+))(\S+)`), + regexp.MustCompile(`(?i)\b(password|passwd|token|secret|api[_-]?key)\s*=\s*(['"]?)[^'"\s]+(['"]?)`), + } + + sanitized := trimmed + for _, pattern := range patterns { + sanitized = pattern.ReplaceAllString(sanitized, `$1[REDACTED]`) + } + return sanitized +} + +func incidentDurationSeconds(startedAt time.Time, endedAt *time.Time) int64 { + end := time.Now() + if endedAt != nil { + end = *endedAt + } + if end.Before(startedAt) { + return 0 + } + return int64(end.Sub(startedAt).Seconds()) +} + +func humanDuration(duration time.Duration) string { + if duration < 0 { + duration = 0 + } + duration = duration.Round(time.Second) + days := int(duration / (24 * time.Hour)) + duration -= time.Duration(days) * 24 * time.Hour + hours := int(duration / time.Hour) + duration -= time.Duration(hours) * time.Hour + minutes := int(duration / time.Minute) + duration -= time.Duration(minutes) * time.Minute + seconds := int(duration / time.Second) + + parts := make([]string, 0, 4) + if days > 0 { + parts = append(parts, fmt.Sprintf("%dd", days)) + } + if hours > 0 { + parts = append(parts, fmt.Sprintf("%dh", hours)) + } + if minutes > 0 { + parts = append(parts, fmt.Sprintf("%dm", minutes)) + } + if seconds > 0 || len(parts) == 0 { + parts = append(parts, fmt.Sprintf("%ds", seconds)) + } + if len(parts) > 2 { + parts = parts[:2] + } + return strings.Join(parts, " ") +} diff --git a/internal/views/pages/settings.gohtml b/internal/views/pages/settings.gohtml index 2e1f129..5105e44 100644 --- a/internal/views/pages/settings.gohtml +++ b/internal/views/pages/settings.gohtml @@ -4,7 +4,7 @@
Themes

Appearance

-

Five defaults. Colored themes work in dark or light.

+

Dark or light only.

@@ -14,18 +14,9 @@

Theme Presets

-
- -
- - - - -
-
- +
-
+
-
+
-
- - -
-
- - -
-
- - -
@@ -90,4 +57,40 @@
+ +
+
+
+

Command History

+
+ + + + + + + + + + + {{if $data.Runs}} + {{range $data.Runs}} + + + + + + + {{end}} + {{else}} + + + + {{end}} + +
Group/VMidTimeStatusCommand
{{if .GroupName}}{{.GroupName}} / {{end}}{{.NodeName}}{{.StartedAt.Format "2006-01-02 15:04:05"}}{{.Status}}{{if .CommandText}}{{.CommandText}}{{else}}{{.Action}}{{end}}
No command history yet.
+
+
+
+
{{end}} diff --git a/internal/views/pages/uptime.gohtml b/internal/views/pages/uptime.gohtml new file mode 100644 index 0000000..0659360 --- /dev/null +++ b/internal/views/pages/uptime.gohtml @@ -0,0 +1,182 @@ +{{define "content"}} +{{$data := .Content}} +
+
+
+
+
Monitors
+
{{$data.Summary.TotalMonitors}}
+
+
+
+
+
+
+
Up
+
{{$data.Summary.UpMonitors}}
+
+
+
+
+
+
+
Down
+
{{$data.Summary.DownMonitors}}
+
+
+
+
+
+
+
Avg latency
+
{{if $data.Summary.AvgLatencyMS}}{{$data.Summary.AvgLatencyMS}}ms{{else}}-{{end}}
+
+
+
+
+ +
+
+
+
+
+

Monitors

+ SSH endpoint checks +
+
+ {{if $data.Monitors}} + {{range $data.Monitors}} +
+
+
+
+
{{.Monitor.Name}}
+
{{.Monitor.Target}}{{if .Monitor.GroupName}} · {{.Monitor.GroupName}}{{end}}
+
+ + {{if eq .Monitor.LastStatus "down"}}Down{{else if eq .Monitor.LastStatus "up"}}Up{{else}}Pending{{end}} + +
+ +
+
+ Availability + {{.AvailabilityText}} +
+
+ Latency + {{if .Monitor.LastLatencyMS}}{{.Monitor.LastLatencyMS}}ms{{else}}-{{end}} +
+
+ Checked + {{.LastCheckedText}} +
+
+ Interval + {{.IntervalText}} +
+
+ + + +
+ {{.StateDurationText}} + {{if .Monitor.LastError}}{{.Monitor.LastError}}{{else}}{{.Monitor.NodeName}}{{end}} +
+
+
+ {{end}} + {{else}} +
+
No monitors yet. Add a VM to start tracking uptime.
+
+ {{end}} +
+
+
+
+ +
+
+
+

Availability

+
+ + + + + + + + + + + + + {{range $data.Periods}} + + + + + + + + + {{end}} + +
PeriodAvailabilityDowntimeIncidentsLongestAvg
{{.Label}}{{.AvailabilityText}}{{.DowntimeText}}{{.Incidents}}{{.LongestText}}{{.AverageText}}
+
+
+
+
+
+ +
+
+
+
+

Incidents

+ {{len $data.Incidents}} recent +
+
+ + + + + + + + + + + + {{if $data.Incidents}} + {{range $data.Incidents}} + + + + + + + + {{end}} + {{else}} + + + + {{end}} + +
MonitorStartedEndedDurationError
{{.MonitorName}}{{.StartedAt.Format "2006-01-02 15:04:05"}}{{if .EndedAt}}{{.EndedAt.Format "2006-01-02 15:04:05"}}{{else}}Active{{end}}{{.DurationText}}{{if .ErrorMessage}}{{.ErrorMessage}}{{else}}Connection failed{{end}}
No incidents recorded yet.
+
+
+
+
+{{end}}