feat(core): add uptime monitoring and command history
This commit is contained in:
@@ -96,6 +96,7 @@ func New() (*App, error) {
|
||||
protected.Get("/nodes/{nodeID}/console/ws", handler.NodeConsoleWebSocket)
|
||||
protected.Get("/groups", handler.GroupsPage)
|
||||
protected.Get("/automations", handler.AutomationsPage)
|
||||
protected.Get("/uptime", handler.UptimePage)
|
||||
protected.Get("/settings", handler.SettingsPage)
|
||||
protected.Post("/settings/theme", handler.UpdateTheme)
|
||||
|
||||
@@ -107,6 +108,7 @@ func New() (*App, error) {
|
||||
editor.Post("/nodes/{nodeID}/commands", handler.NodeQuickCommand)
|
||||
editor.Post("/nodes/{nodeID}/delete", handler.DeleteNode)
|
||||
editor.Post("/automations", handler.CreateAutomation)
|
||||
editor.Post("/uptime/run", handler.RunUptimeChecks)
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ func Load() Config {
|
||||
EncryptionKey: env("MAINTAINARR_ENCRYPTION_KEY", "change-me-encryption-key-32bytes"),
|
||||
OrgName: env("MAINTAINARR_ORG_NAME", "Maintainarr"),
|
||||
BaseURL: env("MAINTAINARR_BASE_URL", "http://localhost:8080"),
|
||||
DefaultTheme: env("MAINTAINARR_THEME", "blue"),
|
||||
DefaultTheme: env("MAINTAINARR_THEME", "dark"),
|
||||
DefaultMode: env("MAINTAINARR_THEME_MODE", "dark"),
|
||||
RefreshCron: env("MAINTAINARR_REFRESH_CRON", "@every 5s"),
|
||||
}
|
||||
|
||||
@@ -127,6 +127,7 @@ func migrate(ctx context.Context, database *sql.DB) error {
|
||||
job_id INTEGER,
|
||||
node_id INTEGER NOT NULL,
|
||||
action TEXT NOT NULL,
|
||||
command_text TEXT NOT NULL DEFAULT '',
|
||||
status TEXT NOT NULL,
|
||||
output TEXT NOT NULL DEFAULT '',
|
||||
triggered_by INTEGER,
|
||||
@@ -136,6 +137,44 @@ func migrate(ctx context.Context, database *sql.DB) error {
|
||||
FOREIGN KEY (node_id) REFERENCES nodes(id),
|
||||
FOREIGN KEY (triggered_by) REFERENCES users(id)
|
||||
);`,
|
||||
`CREATE TABLE IF NOT EXISTS uptime_monitors (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
organization_id INTEGER NOT NULL,
|
||||
node_id INTEGER NOT NULL UNIQUE,
|
||||
name TEXT NOT NULL,
|
||||
target TEXT NOT NULL,
|
||||
monitor_type TEXT NOT NULL DEFAULT 'ssh',
|
||||
interval_seconds INTEGER NOT NULL DEFAULT 60,
|
||||
enabled BOOLEAN NOT NULL DEFAULT 1,
|
||||
last_status TEXT NOT NULL DEFAULT 'unknown',
|
||||
last_latency_ms INTEGER NOT NULL DEFAULT 0,
|
||||
last_checked_at DATETIME,
|
||||
last_error TEXT NOT NULL DEFAULT '',
|
||||
up_since_at DATETIME,
|
||||
current_outage_started_at DATETIME,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (organization_id) REFERENCES organizations(id),
|
||||
FOREIGN KEY (node_id) REFERENCES nodes(id)
|
||||
);`,
|
||||
`CREATE TABLE IF NOT EXISTS uptime_checks (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
monitor_id INTEGER NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
latency_ms INTEGER NOT NULL DEFAULT 0,
|
||||
error_message TEXT NOT NULL DEFAULT '',
|
||||
checked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (monitor_id) REFERENCES uptime_monitors(id)
|
||||
);`,
|
||||
`CREATE TABLE IF NOT EXISTS uptime_incidents (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
monitor_id INTEGER NOT NULL,
|
||||
error_message TEXT NOT NULL DEFAULT '',
|
||||
started_at DATETIME NOT NULL,
|
||||
ended_at DATETIME,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (monitor_id) REFERENCES uptime_monitors(id)
|
||||
);`,
|
||||
}
|
||||
|
||||
for _, statement := range statements {
|
||||
@@ -158,6 +197,9 @@ func migrate(ctx context.Context, database *sql.DB) error {
|
||||
`ALTER TABLE nodes ADD COLUMN memory_total_mb INTEGER NOT NULL DEFAULT 0;`,
|
||||
`ALTER TABLE nodes ADD COLUMN disk_total_gb INTEGER NOT NULL DEFAULT 0;`,
|
||||
`ALTER TABLE automation_jobs ADD COLUMN tag TEXT NOT NULL DEFAULT '';`,
|
||||
`ALTER TABLE command_runs ADD COLUMN command_text TEXT NOT NULL DEFAULT '';`,
|
||||
`ALTER TABLE uptime_monitors ADD COLUMN last_error TEXT NOT NULL DEFAULT '';`,
|
||||
`ALTER TABLE uptime_monitors ADD COLUMN up_since_at DATETIME;`,
|
||||
}
|
||||
|
||||
for _, statement := range alterStatements {
|
||||
|
||||
@@ -57,6 +57,7 @@ type settingsData struct {
|
||||
ThemeVariables template.CSS
|
||||
CurrentTheme string
|
||||
CurrentMode string
|
||||
Runs []models.CommandRun
|
||||
}
|
||||
|
||||
type jobsPageData struct {
|
||||
@@ -67,6 +68,39 @@ type jobsPageData struct {
|
||||
Runs []models.CommandRun
|
||||
}
|
||||
|
||||
type uptimePageData struct {
|
||||
Summary uptimeSummary
|
||||
Periods []uptimePeriodRow
|
||||
Monitors []uptimeMonitorCard
|
||||
Incidents []models.UptimeIncident
|
||||
}
|
||||
|
||||
// uptimeSummary carries the headline counters shown on the uptime page.
type uptimeSummary struct {
	// Monitor counts: all monitors, those whose last status is "up", and
	// those whose last status is "down".
	TotalMonitors, UpMonitors, DownMonitors int
	// AvgLatencyMS is the integer mean of the last latency over monitors
	// that are up and reported a positive latency; zero when there are none.
	AvgLatencyMS int64
}
|
||||
|
||||
// uptimePeriodRow is one pre-formatted line of the per-period statistics
// table (for example "Today" or "Last 7 days").
type uptimePeriodRow struct {
	// Label names the period; AvailabilityText and DowntimeText are the
	// already formatted availability percentage and downtime duration.
	Label, AvailabilityText, DowntimeText string
	// Incidents is the number of incidents counted for the period.
	Incidents int64
	// LongestText and AverageText are the formatted longest and average
	// incident durations.
	LongestText, AverageText string
}
|
||||
|
||||
type uptimeMonitorCard struct {
|
||||
Monitor models.UptimeMonitor
|
||||
Availability float64
|
||||
AvailabilityText string
|
||||
LastCheckedText string
|
||||
StateDurationText string
|
||||
IntervalText string
|
||||
RecentChecks []models.UptimeCheck
|
||||
}
|
||||
|
||||
func New(repo *services.Repository, auth *services.AuthService, sessions *services.SessionService, nodes *services.NodeService, renderer *views.Renderer, org models.Organization, baseURL string) *Handler {
|
||||
return &Handler{
|
||||
repo: repo,
|
||||
@@ -336,11 +370,16 @@ func (h *Handler) CreateNode(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "failed to create node", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if err := h.nodes.EnsureUptimeMonitorForNode(r.Context(), node); err != nil {
|
||||
http.Error(w, "failed to create node monitor", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
if node.SSHUsername != "" && node.SSHPassword != "" {
|
||||
_, _ = h.nodes.RefreshNodeInventory(r.Context(), node)
|
||||
_, _ = h.nodes.RefreshNodeStats(r.Context(), node)
|
||||
}
|
||||
_ = h.nodes.RunAllUptimeChecks(r.Context(), h.org.ID)
|
||||
|
||||
http.Redirect(w, r, "/dashboard", http.StatusSeeOther)
|
||||
}
|
||||
@@ -710,13 +749,33 @@ func (h *Handler) CreateAutomation(w http.ResponseWriter, r *http.Request) {
|
||||
func (h *Handler) SettingsPage(w http.ResponseWriter, r *http.Request) {
|
||||
user := localmiddleware.CurrentUser(r)
|
||||
org := h.currentOrganization(r.Context())
|
||||
currentMode := normalizeMode(org.ThemeMode)
|
||||
currentTheme := normalizeTheme(org.Theme, currentMode)
|
||||
h.render(w, r, "settings", "Theme System", settingsData{
|
||||
ThemeVariables: template.CSS(themePreview),
|
||||
CurrentTheme: org.Theme,
|
||||
CurrentMode: org.ThemeMode,
|
||||
CurrentTheme: currentTheme,
|
||||
CurrentMode: currentMode,
|
||||
Runs: h.settingsRuns(r.Context()),
|
||||
}, user)
|
||||
}
|
||||
|
||||
func (h *Handler) UptimePage(w http.ResponseWriter, r *http.Request) {
|
||||
user := localmiddleware.CurrentUser(r)
|
||||
h.render(w, r, "uptime", "Uptime", h.uptimePageData(r.Context()), user)
|
||||
}
|
||||
|
||||
func (h *Handler) RunUptimeChecks(w http.ResponseWriter, r *http.Request) {
|
||||
if err := h.nodes.EnsureUptimeMonitors(r.Context(), h.org.ID); err != nil {
|
||||
http.Error(w, "failed to sync monitors", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if err := h.nodes.RunAllUptimeChecks(r.Context(), h.org.ID); err != nil {
|
||||
http.Error(w, "failed to run uptime checks", http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
http.Redirect(w, r, "/uptime", http.StatusSeeOther)
|
||||
}
|
||||
|
||||
func (h *Handler) UpdateTheme(w http.ResponseWriter, r *http.Request) {
|
||||
if err := r.ParseForm(); err != nil {
|
||||
http.Error(w, "bad request", http.StatusBadRequest)
|
||||
@@ -725,6 +784,9 @@ func (h *Handler) UpdateTheme(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
theme := strings.TrimSpace(r.FormValue("theme"))
|
||||
mode := strings.TrimSpace(r.FormValue("mode"))
|
||||
if mode == "" {
|
||||
mode = theme
|
||||
}
|
||||
if !isAllowedTheme(theme) || !isAllowedMode(mode) {
|
||||
http.Error(w, "invalid theme selection", http.StatusBadRequest)
|
||||
return
|
||||
@@ -750,12 +812,14 @@ func (h *Handler) render(w http.ResponseWriter, r *http.Request, page, title str
|
||||
org := h.currentOrganization(r.Context())
|
||||
groups, _ := h.repo.ListGroups(r.Context(), org.ID)
|
||||
tags, _ := h.repo.ListTags(r.Context(), org.ID)
|
||||
mode := normalizeMode(org.ThemeMode)
|
||||
theme := normalizeTheme(org.Theme, mode)
|
||||
|
||||
h.renderer.Render(w, page, views.ViewData{
|
||||
Title: title,
|
||||
Shell: shellForPage(page),
|
||||
ThemeClass: org.Theme,
|
||||
ThemeMode: org.ThemeMode,
|
||||
ThemeClass: theme,
|
||||
ThemeMode: mode,
|
||||
CurrentPath: r.URL.Path,
|
||||
User: currentUser,
|
||||
Organization: org,
|
||||
@@ -797,6 +861,75 @@ func (h *Handler) dashboardData(ctx context.Context) dashboardData {
|
||||
}
|
||||
}
|
||||
|
||||
func (h *Handler) uptimePageData(ctx context.Context) uptimePageData {
|
||||
_ = h.nodes.EnsureUptimeMonitors(ctx, h.org.ID)
|
||||
|
||||
monitors, _ := h.nodes.ListUptimeMonitors(ctx, h.org.ID)
|
||||
checksByMonitor, _ := h.nodes.ListRecentUptimeChecks(ctx, h.org.ID, 24)
|
||||
incidents, _ := h.nodes.ListRecentUptimeIncidents(ctx, h.org.ID, 20)
|
||||
|
||||
summary := uptimeSummary{
|
||||
TotalMonitors: len(monitors),
|
||||
}
|
||||
var latencyTotal int64
|
||||
var latencyCount int64
|
||||
cards := make([]uptimeMonitorCard, 0, len(monitors))
|
||||
for _, monitor := range monitors {
|
||||
if monitor.LastStatus == "down" {
|
||||
summary.DownMonitors++
|
||||
} else if monitor.LastStatus == "up" {
|
||||
summary.UpMonitors++
|
||||
}
|
||||
if monitor.LastStatus == "up" && monitor.LastLatencyMS > 0 {
|
||||
latencyTotal += monitor.LastLatencyMS
|
||||
latencyCount++
|
||||
}
|
||||
|
||||
recentChecks := checksByMonitor[monitor.ID]
|
||||
card := uptimeMonitorCard{
|
||||
Monitor: monitor,
|
||||
Availability: availabilityForChecks(recentChecks),
|
||||
AvailabilityText: fmt.Sprintf("%.2f%%", availabilityForChecks(recentChecks)),
|
||||
LastCheckedText: relativeTime(monitor.LastCheckedAt),
|
||||
StateDurationText: monitorStateDuration(monitor),
|
||||
IntervalText: humanizeInterval(monitor.IntervalSeconds),
|
||||
RecentChecks: reverseChecks(recentChecks),
|
||||
}
|
||||
cards = append(cards, card)
|
||||
}
|
||||
if latencyCount > 0 {
|
||||
summary.AvgLatencyMS = latencyTotal / latencyCount
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
periods := []uptimePeriodRow{
|
||||
h.periodRow(ctx, "Today", ptrTime(now.Add(-24*time.Hour))),
|
||||
h.periodRow(ctx, "Last 7 days", ptrTime(now.Add(-7*24*time.Hour))),
|
||||
h.periodRow(ctx, "Last 30 days", ptrTime(now.Add(-30*24*time.Hour))),
|
||||
h.periodRow(ctx, "Last 365 days", ptrTime(now.Add(-365*24*time.Hour))),
|
||||
h.periodRow(ctx, "All time", nil),
|
||||
}
|
||||
|
||||
return uptimePageData{
|
||||
Summary: summary,
|
||||
Periods: periods,
|
||||
Monitors: cards,
|
||||
Incidents: incidents,
|
||||
}
|
||||
}
|
||||
|
||||
func (h *Handler) periodRow(ctx context.Context, label string, since *time.Time) uptimePeriodRow {
|
||||
summary, _ := h.nodes.UptimePeriodSummary(ctx, h.org.ID, since)
|
||||
return uptimePeriodRow{
|
||||
Label: label,
|
||||
AvailabilityText: availabilityText(summary),
|
||||
DowntimeText: humanizeSeconds(summary.DowntimeSeconds),
|
||||
Incidents: summary.IncidentCount,
|
||||
LongestText: humanizeSeconds(summary.LongestIncidentSeconds),
|
||||
AverageText: humanizeSeconds(summary.AvgIncidentSeconds),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *Handler) currentOrganization(ctx context.Context) models.Organization {
|
||||
org, err := h.repo.GetOrganization(ctx)
|
||||
if err != nil {
|
||||
@@ -805,6 +938,109 @@ func (h *Handler) currentOrganization(ctx context.Context) models.Organization {
|
||||
return org
|
||||
}
|
||||
|
||||
func availabilityForChecks(checks []models.UptimeCheck) float64 {
|
||||
if len(checks) == 0 {
|
||||
return 0
|
||||
}
|
||||
var up int
|
||||
for _, check := range checks {
|
||||
if check.Status == "up" {
|
||||
up++
|
||||
}
|
||||
}
|
||||
return float64(up) * 100 / float64(len(checks))
|
||||
}
|
||||
|
||||
func availabilityText(summary models.UptimePeriodSummary) string {
|
||||
if summary.TotalChecks == 0 {
|
||||
return "0.00%"
|
||||
}
|
||||
return fmt.Sprintf("%.2f%%", float64(summary.UpChecks)*100/float64(summary.TotalChecks))
|
||||
}
|
||||
|
||||
// relativeTime describes how long ago value happened.
//
// It returns "Never" for nil, "<n>s ago" under a minute (never less than
// 1s, which also covers timestamps in the future), "<n>m ago" under an hour,
// "<n>h ago" under a day, and otherwise the timestamp formatted as
// "2006-01-02 15:04".
func relativeTime(value *time.Time) string {
	if value == nil {
		return "Never"
	}

	elapsed := time.Since(*value).Round(time.Second)
	switch {
	case elapsed < time.Minute:
		secs := int(elapsed / time.Second)
		if secs < 1 {
			// Clamp zero and negative ages so the label never reads "0s ago".
			secs = 1
		}
		return fmt.Sprintf("%ds ago", secs)
	case elapsed < time.Hour:
		return fmt.Sprintf("%dm ago", int(elapsed/time.Minute))
	case elapsed < 24*time.Hour:
		return fmt.Sprintf("%dh ago", int(elapsed/time.Hour))
	default:
		return value.Format("2006-01-02 15:04")
	}
}
|
||||
|
||||
// humanizeInterval renders a check interval given in seconds as
// "Every hour", "Every N hours", "Every minute", "Every N minutes" or
// "Every N seconds", using the largest unit that divides the interval
// evenly. Non-positive intervals render as "-".
func humanizeInterval(seconds int64) string {
	switch {
	case seconds <= 0:
		return "-"
	case seconds%3600 == 0:
		if count := seconds / 3600; count != 1 {
			return fmt.Sprintf("Every %d hours", count)
		}
		return "Every hour"
	case seconds%60 == 0:
		if count := seconds / 60; count != 1 {
			return fmt.Sprintf("Every %d minutes", count)
		}
		return "Every minute"
	default:
		return fmt.Sprintf("Every %d seconds", seconds)
	}
}
|
||||
|
||||
func monitorStateDuration(monitor models.UptimeMonitor) string {
|
||||
if monitor.LastStatus == "up" && monitor.UpSinceAt != nil {
|
||||
return "Up for " + humanizeSeconds(int64(time.Since(*monitor.UpSinceAt).Seconds()))
|
||||
}
|
||||
if monitor.LastStatus == "down" && monitor.CurrentOutageStartedAt != nil {
|
||||
return "Down for " + humanizeSeconds(int64(time.Since(*monitor.CurrentOutageStartedAt).Seconds()))
|
||||
}
|
||||
return "Awaiting checks"
|
||||
}
|
||||
|
||||
// humanizeSeconds renders a duration given in whole seconds as a compact
// label: "Nd Nh Nm", "Nh Nm", "Nm", or "Ns" for durations under a minute.
// Non-positive values render as "-".
//
// Sub-minute durations are shown in seconds rather than as "0m", so a short
// outage is not displayed as if it had no duration. The arithmetic stays in
// plain int64 seconds, so very large inputs cannot overflow a
// nanosecond-based time.Duration.
func humanizeSeconds(seconds int64) string {
	if seconds <= 0 {
		return "-"
	}

	const (
		secondsPerMinute int64 = 60
		secondsPerHour         = 60 * secondsPerMinute
		secondsPerDay          = 24 * secondsPerHour
	)
	days := seconds / secondsPerDay
	hours := seconds % secondsPerDay / secondsPerHour
	minutes := seconds % secondsPerHour / secondsPerMinute

	switch {
	case days > 0:
		return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
	case hours > 0:
		return fmt.Sprintf("%dh %dm", hours, minutes)
	case minutes > 0:
		return fmt.Sprintf("%dm", minutes)
	default:
		return fmt.Sprintf("%ds", seconds)
	}
}
|
||||
|
||||
// ptrTime returns a pointer to a copy of value, for call sites that take an
// optional *time.Time.
func ptrTime(value time.Time) *time.Time {
	stamp := value
	return &stamp
}
|
||||
|
||||
func reverseChecks(checks []models.UptimeCheck) []models.UptimeCheck {
|
||||
reversed := make([]models.UptimeCheck, len(checks))
|
||||
for i := range checks {
|
||||
reversed[len(checks)-1-i] = checks[i]
|
||||
}
|
||||
return reversed
|
||||
}
|
||||
|
||||
func shellForPage(page string) string {
|
||||
switch page {
|
||||
case "login", "login_otp", "register":
|
||||
@@ -826,7 +1062,7 @@ func (h *Handler) listRuns(ctx context.Context, nodeID int64) []models.CommandRu
|
||||
var runs []models.CommandRun
|
||||
for rows.Next() {
|
||||
var run models.CommandRun
|
||||
if err := rows.Scan(&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.Status, &run.Output, &run.TriggeredBy, &run.StartedAt, &run.FinishedAt); err == nil {
|
||||
if err := rows.Scan(&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.CommandText, &run.Status, &run.Output, &run.TriggeredBy, &run.StartedAt, &run.FinishedAt); err == nil {
|
||||
runs = append(runs, run)
|
||||
}
|
||||
}
|
||||
@@ -835,7 +1071,7 @@ func (h *Handler) listRuns(ctx context.Context, nodeID int64) []models.CommandRu
|
||||
|
||||
func (h *Handler) nodesRunQuery(ctx context.Context, nodeID int64) (*sql.Rows, error) {
|
||||
return h.nodesDB().QueryContext(ctx, `
|
||||
SELECT id, job_id, node_id, action, status, output, triggered_by, started_at, finished_at
|
||||
SELECT id, job_id, node_id, action, command_text, status, output, triggered_by, started_at, finished_at
|
||||
FROM command_runs
|
||||
WHERE node_id = ?
|
||||
ORDER BY started_at DESC
|
||||
@@ -843,6 +1079,14 @@ func (h *Handler) nodesRunQuery(ctx context.Context, nodeID int64) (*sql.Rows, e
|
||||
`, nodeID)
|
||||
}
|
||||
|
||||
func (h *Handler) settingsRuns(ctx context.Context) []models.CommandRun {
|
||||
runs, err := h.nodes.ListCommandHistory(ctx, h.org.ID)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return runs
|
||||
}
|
||||
|
||||
func (h *Handler) nodesDB() *sql.DB {
|
||||
return h.repo.DB()
|
||||
}
|
||||
@@ -905,7 +1149,7 @@ func defaultIfEmpty(value, fallback string) string {
|
||||
|
||||
func isAllowedTheme(value string) bool {
|
||||
switch value {
|
||||
case "dark", "light", "green", "red", "blue":
|
||||
case "dark", "light":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
@@ -916,6 +1160,20 @@ func isAllowedMode(value string) bool {
|
||||
return value == "dark" || value == "light"
|
||||
}
|
||||
|
||||
func normalizeTheme(theme, mode string) string {
|
||||
if isAllowedTheme(theme) {
|
||||
return theme
|
||||
}
|
||||
return normalizeMode(mode)
|
||||
}
|
||||
|
||||
func normalizeMode(mode string) string {
|
||||
if isAllowedMode(mode) {
|
||||
return mode
|
||||
}
|
||||
return "dark"
|
||||
}
|
||||
|
||||
func saveKeyUpload(r *http.Request, field string) (string, error) {
|
||||
file, header, err := r.FormFile(field)
|
||||
if err != nil {
|
||||
|
||||
@@ -98,7 +98,9 @@ type CommandRun struct {
|
||||
NodeID int64
|
||||
JobName string
|
||||
NodeName string
|
||||
GroupName string
|
||||
Action string
|
||||
CommandText string
|
||||
Status string
|
||||
Output string
|
||||
TriggeredBy *int64
|
||||
@@ -106,3 +108,57 @@ type CommandRun struct {
|
||||
FinishedAt *time.Time
|
||||
DurationText string
|
||||
}
|
||||
|
||||
// UptimeMonitor is a row of the uptime_monitors table, plus the node and
// group names that ListUptimeMonitors joins in.
type UptimeMonitor struct {
	// Identity and ownership.
	ID             int64
	OrganizationID int64
	NodeID         int64

	// Display names joined from the nodes and vm_groups tables; empty when
	// the join finds no row.
	NodeName  string
	GroupName string

	// Monitor definition.
	Name            string
	Target          string
	MonitorType     string
	IntervalSeconds int64
	Enabled         bool

	// Latest check result; LastCheckedAt is nil while the column is NULL.
	LastStatus    string
	LastLatencyMS int64
	LastCheckedAt *time.Time
	LastError     string

	// State-tracking timestamps; either may be nil (the columns are nullable).
	UpSinceAt              *time.Time
	CurrentOutageStartedAt *time.Time

	// Row bookkeeping.
	CreatedAt time.Time
	UpdatedAt time.Time
}
|
||||
|
||||
// UptimeCheck is a row of the uptime_checks table: the outcome of one probe
// of a monitor.
type UptimeCheck struct {
	ID        int64
	MonitorID int64 // references uptime_monitors.id

	Status       string // the code in this file compares it to "up" and "down"
	LatencyMS    int64
	ErrorMessage string

	CheckedAt time.Time
}
|
||||
|
||||
// UptimeIncident is a row of the uptime_incidents table, plus the display
// fields that ListRecentUptimeIncidents fills in.
type UptimeIncident struct {
	ID        int64
	MonitorID int64 // references uptime_monitors.id

	// Names joined from uptime_monitors, nodes and vm_groups.
	MonitorName string
	NodeName    string
	GroupName   string

	ErrorMessage string

	// EndedAt is nil while the ended_at column is NULL.
	StartedAt time.Time
	EndedAt   *time.Time

	// Derived from StartedAt / EndedAt when the incident is listed.
	DurationSeconds int64
	DurationText    string
}
|
||||
|
||||
// UptimePeriodSummary aggregates checks and incidents over one reporting
// period, as computed by NodeService.UptimePeriodSummary.
type UptimePeriodSummary struct {
	// Check counters for the period.
	TotalChecks, UpChecks, DownChecks int64
	// AvgLatencyMS is the average latency of "up" checks, as an integer.
	AvgLatencyMS int64
	// DowntimeSeconds is the sum of the monitor interval over every "down"
	// check in the period.
	DowntimeSeconds int64
	// Incident statistics for the period.
	IncidentCount, LongestIncidentSeconds, AvgIncidentSeconds int64
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"net"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -172,6 +173,39 @@ func (s *NodeService) SaveNode(ctx context.Context, node *models.Node) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *NodeService) EnsureUptimeMonitorForNode(ctx context.Context, node *models.Node) error {
|
||||
target := fmt.Sprintf("%s:%d", strings.TrimSpace(node.IPAddress), node.SSHPort)
|
||||
name := strings.TrimSpace(node.Name)
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
|
||||
_, err := s.db.ExecContext(ctx, `
|
||||
INSERT INTO uptime_monitors (
|
||||
organization_id, node_id, name, target, monitor_type, interval_seconds, enabled
|
||||
) VALUES (?, ?, ?, ?, 'ssh', 60, 1)
|
||||
ON CONFLICT(node_id) DO UPDATE SET
|
||||
organization_id = excluded.organization_id,
|
||||
name = excluded.name,
|
||||
target = excluded.target,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
`, node.OrganizationID, node.ID, name, target)
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *NodeService) EnsureUptimeMonitors(ctx context.Context, orgID int64) error {
|
||||
nodes, err := s.ListNodes(ctx, orgID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range nodes {
|
||||
if err := s.EnsureUptimeMonitorForNode(ctx, &nodes[i]); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *NodeService) DeleteNode(ctx context.Context, orgID, nodeID int64) error {
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
@@ -237,6 +271,13 @@ echo "UPTIME=${uptime:-0}"
|
||||
|
||||
output, err := s.RunSSHCommand(ctx, node, strings.TrimSpace(statsScript))
|
||||
if err != nil {
|
||||
s.logCommandRun(ctx, commandRunParams{
|
||||
NodeID: node.ID,
|
||||
Action: "refresh-stats",
|
||||
CommandText: sanitizeCommand(statsScript),
|
||||
Status: "failed",
|
||||
Output: strings.TrimSpace(output + "\n" + err.Error()),
|
||||
})
|
||||
return "", err
|
||||
}
|
||||
|
||||
@@ -270,6 +311,13 @@ echo "UPTIME=${uptime:-0}"
|
||||
node.DiskUsage = stats["DISK"]
|
||||
node.UptimeSeconds = int64(stats["UPTIME"])
|
||||
node.LastSeenAt = &now
|
||||
s.logCommandRun(ctx, commandRunParams{
|
||||
NodeID: node.ID,
|
||||
Action: "refresh-stats",
|
||||
CommandText: sanitizeCommand(statsScript),
|
||||
Status: "completed",
|
||||
Output: output,
|
||||
})
|
||||
|
||||
return output, nil
|
||||
}
|
||||
@@ -304,6 +352,15 @@ func (s *NodeService) RefreshNodeInventory(ctx context.Context, node *models.Nod
|
||||
`DISK_GB="$(df -BG / 2>/dev/null | awk 'NR==2 {gsub(/G/, "", $2); print $2}')"; echo DISK_GB="${DISK_GB:-0}"`,
|
||||
}, " ; "))
|
||||
if err != nil {
|
||||
s.logCommandRun(ctx, commandRunParams{
|
||||
NodeID: node.ID,
|
||||
Action: "refresh-inventory",
|
||||
CommandText: sanitizeCommand(strings.Join([]string{
|
||||
`read /etc/os-release, hostname, kernel, package manager, cpu, gpu, shell, package count, memory, disk`,
|
||||
}, "")),
|
||||
Status: "failed",
|
||||
Output: strings.TrimSpace(output + "\n" + err.Error()),
|
||||
})
|
||||
return "", err
|
||||
}
|
||||
|
||||
@@ -353,6 +410,14 @@ func (s *NodeService) RefreshNodeInventory(ctx context.Context, node *models.Nod
|
||||
return output, err
|
||||
}
|
||||
|
||||
s.logCommandRun(ctx, commandRunParams{
|
||||
NodeID: node.ID,
|
||||
Action: "refresh-inventory",
|
||||
CommandText: "inventory probe",
|
||||
Status: "completed",
|
||||
Output: output,
|
||||
})
|
||||
|
||||
return output, nil
|
||||
}
|
||||
|
||||
@@ -419,10 +484,14 @@ func (s *NodeService) RunAction(ctx context.Context, node *models.Node, action s
|
||||
status = "failed"
|
||||
}
|
||||
|
||||
_, _ = s.db.ExecContext(ctx, `
|
||||
INSERT INTO command_runs (node_id, action, status, output, triggered_by)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
`, node.ID, action, status, output, userID)
|
||||
s.logCommandRun(ctx, commandRunParams{
|
||||
NodeID: node.ID,
|
||||
Action: action,
|
||||
CommandText: sanitizeCommand(command),
|
||||
Status: status,
|
||||
Output: output,
|
||||
TriggeredBy: userID,
|
||||
})
|
||||
|
||||
return output, err
|
||||
}
|
||||
@@ -436,10 +505,16 @@ func (s *NodeService) RunAdHocCommand(ctx context.Context, node *models.Node, la
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
_, _ = s.db.ExecContext(ctx, `
|
||||
INSERT INTO command_runs (node_id, action, status, output, triggered_by, started_at, finished_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
`, node.ID, label, status, output, userID, now, now)
|
||||
s.logCommandRun(ctx, commandRunParams{
|
||||
NodeID: node.ID,
|
||||
Action: label,
|
||||
CommandText: sanitizeCommand(command),
|
||||
Status: status,
|
||||
Output: output,
|
||||
TriggeredBy: userID,
|
||||
StartedAt: &now,
|
||||
FinishedAt: &now,
|
||||
})
|
||||
|
||||
return output, err
|
||||
}
|
||||
@@ -487,11 +562,12 @@ func (s *NodeService) CreateAutomation(ctx context.Context, job *models.Automati
|
||||
|
||||
func (s *NodeService) ListJobRuns(ctx context.Context, orgID int64) ([]models.CommandRun, error) {
|
||||
rows, err := s.db.QueryContext(ctx, `
|
||||
SELECT cr.id, cr.job_id, cr.node_id, cr.action, cr.status, cr.output, cr.triggered_by,
|
||||
cr.started_at, cr.finished_at, COALESCE(j.name, ''), COALESCE(n.name, '')
|
||||
SELECT cr.id, cr.job_id, cr.node_id, cr.action, cr.command_text, cr.status, cr.output, cr.triggered_by,
|
||||
cr.started_at, cr.finished_at, COALESCE(j.name, ''), COALESCE(n.name, ''), COALESCE(g.name, '')
|
||||
FROM command_runs cr
|
||||
LEFT JOIN automation_jobs j ON j.id = cr.job_id
|
||||
LEFT JOIN nodes n ON n.id = cr.node_id
|
||||
LEFT JOIN vm_groups g ON g.id = n.group_id
|
||||
WHERE j.organization_id = ? OR (j.id IS NULL AND n.organization_id = ?)
|
||||
ORDER BY cr.started_at DESC
|
||||
LIMIT 50
|
||||
@@ -505,8 +581,8 @@ func (s *NodeService) ListJobRuns(ctx context.Context, orgID int64) ([]models.Co
|
||||
for rows.Next() {
|
||||
var run models.CommandRun
|
||||
if err := rows.Scan(
|
||||
&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.Status, &run.Output, &run.TriggeredBy,
|
||||
&run.StartedAt, &run.FinishedAt, &run.JobName, &run.NodeName,
|
||||
&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.CommandText, &run.Status, &run.Output, &run.TriggeredBy,
|
||||
&run.StartedAt, &run.FinishedAt, &run.JobName, &run.NodeName, &run.GroupName,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -517,6 +593,308 @@ func (s *NodeService) ListJobRuns(ctx context.Context, orgID int64) ([]models.Co
|
||||
return runs, rows.Err()
|
||||
}
|
||||
|
||||
func (s *NodeService) ListCommandHistory(ctx context.Context, orgID int64) ([]models.CommandRun, error) {
|
||||
rows, err := s.db.QueryContext(ctx, `
|
||||
SELECT cr.id, cr.job_id, cr.node_id, cr.action, cr.command_text, cr.status, cr.output, cr.triggered_by,
|
||||
cr.started_at, cr.finished_at, COALESCE(j.name, ''), COALESCE(n.name, ''), COALESCE(g.name, '')
|
||||
FROM command_runs cr
|
||||
LEFT JOIN automation_jobs j ON j.id = cr.job_id
|
||||
LEFT JOIN nodes n ON n.id = cr.node_id
|
||||
LEFT JOIN vm_groups g ON g.id = n.group_id
|
||||
WHERE n.organization_id = ? OR j.organization_id = ?
|
||||
ORDER BY cr.started_at DESC
|
||||
LIMIT 200
|
||||
`, orgID, orgID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var runs []models.CommandRun
|
||||
for rows.Next() {
|
||||
var run models.CommandRun
|
||||
if err := rows.Scan(
|
||||
&run.ID, &run.JobID, &run.NodeID, &run.Action, &run.CommandText, &run.Status, &run.Output, &run.TriggeredBy,
|
||||
&run.StartedAt, &run.FinishedAt, &run.JobName, &run.NodeName, &run.GroupName,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
run.DurationText = formatDuration(run.StartedAt, run.FinishedAt)
|
||||
runs = append(runs, run)
|
||||
}
|
||||
return runs, rows.Err()
|
||||
}
|
||||
|
||||
func (s *NodeService) ListUptimeMonitors(ctx context.Context, orgID int64) ([]models.UptimeMonitor, error) {
|
||||
rows, err := s.db.QueryContext(ctx, `
|
||||
SELECT m.id, m.organization_id, m.node_id, COALESCE(n.name, ''), COALESCE(g.name, ''),
|
||||
m.name, m.target, m.monitor_type, m.interval_seconds, m.enabled, m.last_status,
|
||||
m.last_latency_ms, m.last_checked_at, m.last_error, m.up_since_at, m.current_outage_started_at,
|
||||
m.created_at, m.updated_at
|
||||
FROM uptime_monitors m
|
||||
LEFT JOIN nodes n ON n.id = m.node_id
|
||||
LEFT JOIN vm_groups g ON g.id = n.group_id
|
||||
WHERE m.organization_id = ?
|
||||
ORDER BY m.name ASC
|
||||
`, orgID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var monitors []models.UptimeMonitor
|
||||
for rows.Next() {
|
||||
var monitor models.UptimeMonitor
|
||||
if err := rows.Scan(
|
||||
&monitor.ID, &monitor.OrganizationID, &monitor.NodeID, &monitor.NodeName, &monitor.GroupName,
|
||||
&monitor.Name, &monitor.Target, &monitor.MonitorType, &monitor.IntervalSeconds, &monitor.Enabled, &monitor.LastStatus,
|
||||
&monitor.LastLatencyMS, &monitor.LastCheckedAt, &monitor.LastError, &monitor.UpSinceAt, &monitor.CurrentOutageStartedAt,
|
||||
&monitor.CreatedAt, &monitor.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
monitors = append(monitors, monitor)
|
||||
}
|
||||
return monitors, rows.Err()
|
||||
}
|
||||
|
||||
func (s *NodeService) ListRecentUptimeChecks(ctx context.Context, orgID int64, limitPerMonitor int) (map[int64][]models.UptimeCheck, error) {
|
||||
if limitPerMonitor <= 0 {
|
||||
limitPerMonitor = 24
|
||||
}
|
||||
|
||||
rows, err := s.db.QueryContext(ctx, `
|
||||
SELECT c.id, c.monitor_id, c.status, c.latency_ms, c.error_message, c.checked_at
|
||||
FROM uptime_checks c
|
||||
INNER JOIN uptime_monitors m ON m.id = c.monitor_id
|
||||
WHERE m.organization_id = ?
|
||||
ORDER BY c.checked_at DESC
|
||||
LIMIT 500
|
||||
`, orgID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
results := map[int64][]models.UptimeCheck{}
|
||||
for rows.Next() {
|
||||
var check models.UptimeCheck
|
||||
if err := rows.Scan(&check.ID, &check.MonitorID, &check.Status, &check.LatencyMS, &check.ErrorMessage, &check.CheckedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(results[check.MonitorID]) >= limitPerMonitor {
|
||||
continue
|
||||
}
|
||||
results[check.MonitorID] = append(results[check.MonitorID], check)
|
||||
}
|
||||
return results, rows.Err()
|
||||
}
|
||||
|
||||
func (s *NodeService) ListRecentUptimeIncidents(ctx context.Context, orgID int64, limit int) ([]models.UptimeIncident, error) {
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
rows, err := s.db.QueryContext(ctx, `
|
||||
SELECT i.id, i.monitor_id, COALESCE(m.name, ''), COALESCE(n.name, ''), COALESCE(g.name, ''),
|
||||
i.error_message, i.started_at, i.ended_at
|
||||
FROM uptime_incidents i
|
||||
INNER JOIN uptime_monitors m ON m.id = i.monitor_id
|
||||
LEFT JOIN nodes n ON n.id = m.node_id
|
||||
LEFT JOIN vm_groups g ON g.id = n.group_id
|
||||
WHERE m.organization_id = ?
|
||||
ORDER BY i.started_at DESC
|
||||
LIMIT ?
|
||||
`, orgID, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var incidents []models.UptimeIncident
|
||||
for rows.Next() {
|
||||
var incident models.UptimeIncident
|
||||
if err := rows.Scan(
|
||||
&incident.ID, &incident.MonitorID, &incident.MonitorName, &incident.NodeName, &incident.GroupName,
|
||||
&incident.ErrorMessage, &incident.StartedAt, &incident.EndedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
incident.DurationSeconds = incidentDurationSeconds(incident.StartedAt, incident.EndedAt)
|
||||
incident.DurationText = humanDuration(time.Duration(incident.DurationSeconds) * time.Second)
|
||||
incidents = append(incidents, incident)
|
||||
}
|
||||
return incidents, rows.Err()
|
||||
}
|
||||
|
||||
func (s *NodeService) UptimePeriodSummary(ctx context.Context, orgID int64, since *time.Time) (models.UptimePeriodSummary, error) {
|
||||
var summary models.UptimePeriodSummary
|
||||
args := []any{orgID}
|
||||
filter := ""
|
||||
if since != nil {
|
||||
filter = " AND c.checked_at >= ?"
|
||||
args = append(args, *since)
|
||||
}
|
||||
|
||||
if err := s.db.QueryRowContext(ctx, `
|
||||
SELECT
|
||||
COUNT(*),
|
||||
COALESCE(SUM(CASE WHEN c.status = 'up' THEN 1 ELSE 0 END), 0),
|
||||
COALESCE(SUM(CASE WHEN c.status = 'down' THEN 1 ELSE 0 END), 0),
|
||||
COALESCE(CAST(AVG(CASE WHEN c.status = 'up' THEN c.latency_ms END) AS INTEGER), 0),
|
||||
COALESCE(SUM(CASE WHEN c.status = 'down' THEN m.interval_seconds ELSE 0 END), 0)
|
||||
FROM uptime_checks c
|
||||
INNER JOIN uptime_monitors m ON m.id = c.monitor_id
|
||||
WHERE m.organization_id = ?`+filter, args...).Scan(
|
||||
&summary.TotalChecks, &summary.UpChecks, &summary.DownChecks, &summary.AvgLatencyMS, &summary.DowntimeSeconds,
|
||||
); err != nil {
|
||||
return summary, err
|
||||
}
|
||||
|
||||
incidentArgs := []any{orgID}
|
||||
incidentFilter := ""
|
||||
if since != nil {
|
||||
incidentFilter = " AND i.started_at >= ?"
|
||||
incidentArgs = append(incidentArgs, *since)
|
||||
}
|
||||
|
||||
var longest sql.NullInt64
|
||||
var avg sql.NullFloat64
|
||||
if err := s.db.QueryRowContext(ctx, `
|
||||
SELECT
|
||||
COUNT(*),
|
||||
MAX(CAST((strftime('%s', COALESCE(i.ended_at, CURRENT_TIMESTAMP)) - strftime('%s', i.started_at)) AS INTEGER)),
|
||||
AVG(CAST((strftime('%s', COALESCE(i.ended_at, CURRENT_TIMESTAMP)) - strftime('%s', i.started_at)) AS INTEGER))
|
||||
FROM uptime_incidents i
|
||||
INNER JOIN uptime_monitors m ON m.id = i.monitor_id
|
||||
WHERE m.organization_id = ?`+incidentFilter, incidentArgs...).Scan(
|
||||
&summary.IncidentCount, &longest, &avg,
|
||||
); err != nil {
|
||||
return summary, err
|
||||
}
|
||||
if longest.Valid {
|
||||
summary.LongestIncidentSeconds = longest.Int64
|
||||
}
|
||||
if avg.Valid {
|
||||
summary.AvgIncidentSeconds = int64(avg.Float64)
|
||||
}
|
||||
return summary, nil
|
||||
}
|
||||
|
||||
func (s *NodeService) RunAllUptimeChecks(ctx context.Context, orgID int64) error {
|
||||
monitors, err := s.ListUptimeMonitors(ctx, orgID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range monitors {
|
||||
if !monitors[i].Enabled {
|
||||
continue
|
||||
}
|
||||
_ = s.RunUptimeCheck(ctx, &monitors[i])
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *NodeService) RunUptimeCheck(ctx context.Context, monitor *models.UptimeMonitor) error {
|
||||
target := strings.TrimSpace(monitor.Target)
|
||||
if target == "" {
|
||||
return fmt.Errorf("empty monitor target")
|
||||
}
|
||||
|
||||
startedAt := time.Now()
|
||||
timeout := 5 * time.Second
|
||||
conn, err := net.DialTimeout("tcp", target, timeout)
|
||||
latencyMS := int64(time.Since(startedAt).Milliseconds())
|
||||
status := "up"
|
||||
errorMessage := ""
|
||||
if err != nil {
|
||||
status = "down"
|
||||
errorMessage = err.Error()
|
||||
} else {
|
||||
_ = conn.Close()
|
||||
}
|
||||
|
||||
if latencyMS < 0 {
|
||||
latencyMS = 0
|
||||
}
|
||||
if _, execErr := s.db.ExecContext(ctx, `
|
||||
INSERT INTO uptime_checks (monitor_id, status, latency_ms, error_message, checked_at)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
`, monitor.ID, status, latencyMS, errorMessage, startedAt); execErr != nil {
|
||||
return execErr
|
||||
}
|
||||
|
||||
if status == "down" && monitor.LastStatus != "down" {
|
||||
if _, execErr := s.db.ExecContext(ctx, `
|
||||
INSERT INTO uptime_incidents (monitor_id, error_message, started_at)
|
||||
VALUES (?, ?, ?)
|
||||
`, monitor.ID, errorMessage, startedAt); execErr != nil {
|
||||
return execErr
|
||||
}
|
||||
}
|
||||
|
||||
if status == "up" && monitor.LastStatus == "down" {
|
||||
if _, execErr := s.db.ExecContext(ctx, `
|
||||
UPDATE uptime_incidents
|
||||
SET ended_at = ?
|
||||
WHERE id = (
|
||||
SELECT id
|
||||
FROM uptime_incidents
|
||||
WHERE monitor_id = ? AND ended_at IS NULL
|
||||
ORDER BY started_at DESC
|
||||
LIMIT 1
|
||||
)
|
||||
`, startedAt, monitor.ID); execErr != nil {
|
||||
return execErr
|
||||
}
|
||||
}
|
||||
|
||||
var upSinceAt any
|
||||
var outageStartedAt any
|
||||
if status == "up" {
|
||||
if monitor.LastStatus == "up" && monitor.UpSinceAt != nil {
|
||||
upSinceAt = *monitor.UpSinceAt
|
||||
} else {
|
||||
upSinceAt = startedAt
|
||||
}
|
||||
outageStartedAt = nil
|
||||
} else {
|
||||
if monitor.LastStatus == "down" && monitor.CurrentOutageStartedAt != nil {
|
||||
outageStartedAt = *monitor.CurrentOutageStartedAt
|
||||
} else {
|
||||
outageStartedAt = startedAt
|
||||
}
|
||||
upSinceAt = nil
|
||||
}
|
||||
|
||||
_, err = s.db.ExecContext(ctx, `
|
||||
UPDATE uptime_monitors
|
||||
SET last_status = ?, last_latency_ms = ?, last_checked_at = ?, last_error = ?,
|
||||
up_since_at = ?, current_outage_started_at = ?, updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = ?
|
||||
`, status, latencyMS, startedAt, errorMessage, upSinceAt, outageStartedAt, monitor.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
monitor.LastStatus = status
|
||||
monitor.LastLatencyMS = latencyMS
|
||||
monitor.LastCheckedAt = &startedAt
|
||||
monitor.LastError = errorMessage
|
||||
if status == "up" {
|
||||
if upTime, ok := upSinceAt.(time.Time); ok {
|
||||
monitor.UpSinceAt = &upTime
|
||||
}
|
||||
monitor.CurrentOutageStartedAt = nil
|
||||
} else {
|
||||
if downTime, ok := outageStartedAt.(time.Time); ok {
|
||||
monitor.CurrentOutageStartedAt = &downTime
|
||||
}
|
||||
monitor.UpSinceAt = nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func sendMagicPacket(macAddress string) error {
|
||||
hw, err := net.ParseMAC(macAddress)
|
||||
if err != nil {
|
||||
@@ -556,6 +934,10 @@ func NewSchedulerService(database *sql.DB, nodeService *NodeService) *SchedulerS
|
||||
}
|
||||
|
||||
func (s *SchedulerService) Start(ctx context.Context, orgID int64, refreshSpec string) error {
|
||||
if err := s.nodeService.EnsureUptimeMonitors(ctx, orgID); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := s.cron.AddFunc(refreshSpec, func() {
|
||||
nodes, err := s.nodeService.ListNodes(ctx, orgID)
|
||||
if err != nil {
|
||||
@@ -571,6 +953,13 @@ func (s *SchedulerService) Start(ctx context.Context, orgID int64, refreshSpec s
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := s.cron.AddFunc("@every 1m", func() {
|
||||
_ = s.nodeService.EnsureUptimeMonitors(context.Background(), orgID)
|
||||
_ = s.nodeService.RunAllUptimeChecks(context.Background(), orgID)
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
jobs, err := s.nodeService.ListAutomations(ctx, orgID)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -611,10 +1000,16 @@ func (s *SchedulerService) runAutomation(ctx context.Context, job models.Automat
|
||||
}
|
||||
finishedAt := time.Now()
|
||||
|
||||
_, _ = s.db.ExecContext(ctx, `
|
||||
INSERT INTO command_runs (job_id, node_id, action, status, output, started_at, finished_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
`, job.ID, node.ID, job.Name, status, output, startedAt, finishedAt)
|
||||
s.nodeService.logCommandRun(ctx, commandRunParams{
|
||||
JobID: &job.ID,
|
||||
NodeID: node.ID,
|
||||
Action: job.Name,
|
||||
CommandText: sanitizeCommand(job.Command),
|
||||
Status: status,
|
||||
Output: output,
|
||||
StartedAt: &startedAt,
|
||||
FinishedAt: &finishedAt,
|
||||
})
|
||||
|
||||
lastRunAt = finishedAt
|
||||
}
|
||||
@@ -673,3 +1068,95 @@ func formatDuration(startedAt time.Time, finishedAt *time.Time) string {
|
||||
}
|
||||
return fmt.Sprintf("%dm %ds", minutes, seconds)
|
||||
}
|
||||
|
||||
// commandRunParams carries the values logCommandRun writes into one
// command_runs row. Pointer fields are optional.
type commandRunParams struct {
	JobID       *int64     // job_id column; optional
	NodeID      int64      // node_id column
	Action      string     // action column
	CommandText string     // command_text column
	Status      string     // status column
	Output      string     // output column
	TriggeredBy *int64     // triggered_by column; optional
	StartedAt   *time.Time // started_at; logCommandRun substitutes the current time when nil
	FinishedAt  *time.Time // finished_at; logCommandRun substitutes the current time when nil
}
|
||||
|
||||
func (s *NodeService) logCommandRun(ctx context.Context, params commandRunParams) {
|
||||
startedAt := time.Now()
|
||||
if params.StartedAt != nil {
|
||||
startedAt = *params.StartedAt
|
||||
}
|
||||
finishedAt := params.FinishedAt
|
||||
if finishedAt == nil {
|
||||
value := time.Now()
|
||||
finishedAt = &value
|
||||
}
|
||||
|
||||
_, _ = s.db.ExecContext(ctx, `
|
||||
INSERT INTO command_runs (job_id, node_id, action, command_text, status, output, triggered_by, started_at, finished_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`, params.JobID, params.NodeID, params.Action, params.CommandText, params.Status, params.Output, params.TriggeredBy, startedAt, finishedAt)
|
||||
}
|
||||
|
||||
// Patterns used by sanitizeCommand, compiled once at package initialization.
// In both, group 1 is the part to keep (the key including its separator) and
// the remainder is the secret value: any run of non-space characters and
// quoted segments, so a quoted value containing spaces is consumed as a whole.
var (
	// --password VALUE, --token=VALUE, --secret "VALUE WITH SPACES", ...
	sensitiveFlagPattern = regexp.MustCompile(
		`(?i)(--(?:password|token|secret)(?:=|\s+))(?:"[^"]*"|'[^']*'|\S)+`)
	// password=VALUE, API_KEY = 'VALUE', token="VALUE", ...
	sensitiveAssignmentPattern = regexp.MustCompile(
		`(?i)\b((?:password|passwd|token|secret|api[_-]?key)\s*=\s*)(?:"[^"]*"|'[^']*'|\S)+`)
)

// sanitizeCommand returns command with surrounding whitespace trimmed and the
// values of well-known secret options replaced by "[REDACTED]".
//
// Recognized forms (case-insensitive):
//   - long flags --password, --token and --secret followed by "=" or
//     whitespace and a value;
//   - assignments to password, passwd, token, secret, api_key, api-key or
//     apikey, with optional spaces around "=".
//
// The key and its separator are preserved, so "--password=x" becomes
// "--password=[REDACTED]". An empty or whitespace-only command yields "".
func sanitizeCommand(command string) string {
	trimmed := strings.TrimSpace(command)
	if trimmed == "" {
		return ""
	}

	// Flags first; the assignment pass then re-matches "--password=[REDACTED]"
	// and rewrites it to the same text, so the order is safe.
	sanitized := sensitiveFlagPattern.ReplaceAllString(trimmed, `${1}[REDACTED]`)
	return sensitiveAssignmentPattern.ReplaceAllString(sanitized, `${1}[REDACTED]`)
}
|
||||
|
||||
// incidentDurationSeconds returns the whole number of seconds between
// startedAt and endedAt. A nil endedAt means the incident is still open and
// the current time is used as its end. An end that lies before the start
// yields 0 rather than a negative value.
func incidentDurationSeconds(startedAt time.Time, endedAt *time.Time) int64 {
	end := time.Now()
	if endedAt != nil {
		end = *endedAt
	}
	if end.Before(startedAt) {
		return 0
	}
	// Integer division truncates fractional seconds without a float round-trip.
	return int64(end.Sub(startedAt) / time.Second)
}
|
||||
|
||||
// humanDuration renders duration as a short text such as "1d 5m" or "42s".
//
// The value is rounded to the nearest second and split into days, hours,
// minutes and seconds. Units that are zero are omitted and only the two most
// significant remaining units are shown. Negative and sub-half-second
// durations render as "0s".
func humanDuration(duration time.Duration) string {
	if duration < 0 {
		duration = 0
	}
	remaining := duration.Round(time.Second)

	units := [...]struct {
		size   time.Duration
		suffix string
	}{
		{24 * time.Hour, "d"},
		{time.Hour, "h"},
		{time.Minute, "m"},
		{time.Second, "s"},
	}

	const maxParts = 2
	parts := make([]string, 0, maxParts)
	for _, unit := range units {
		count := int64(remaining / unit.size)
		remaining -= time.Duration(count) * unit.size
		if count > 0 && len(parts) < maxParts {
			parts = append(parts, fmt.Sprintf("%d%s", count, unit.suffix))
		}
	}
	if len(parts) == 0 {
		return "0s"
	}
	return strings.Join(parts, " ")
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
<div>
|
||||
<div class="text-uppercase small fw-semibold text-primary mb-2">Themes</div>
|
||||
<h1 class="display-6 fw-bold mb-2">Appearance</h1>
|
||||
<p class="text-body-secondary mb-0">Five defaults. Colored themes work in dark or light.</p>
|
||||
<p class="text-body-secondary mb-0">Dark or light only.</p>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
@@ -14,18 +14,9 @@
|
||||
<div class="card-body p-4">
|
||||
<h2 class="h4 mb-3">Theme Presets</h2>
|
||||
<form method="post" action="/settings/theme">
|
||||
<div class="mb-4">
|
||||
<label class="form-label d-block">Mode</label>
|
||||
<div class="d-flex gap-2 flex-wrap">
|
||||
<input type="radio" class="btn-check" name="mode" id="mode-dark" value="dark" {{if eq $data.CurrentMode "dark"}}checked{{end}}>
|
||||
<label class="btn btn-outline-secondary" for="mode-dark">Dark</label>
|
||||
<input type="radio" class="btn-check" name="mode" id="mode-light" value="light" {{if eq $data.CurrentMode "light"}}checked{{end}}>
|
||||
<label class="btn btn-outline-secondary" for="mode-light">Light</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<input type="hidden" name="mode" value="{{$data.CurrentTheme}}">
|
||||
<div class="row g-3 mb-4">
|
||||
<div class="col-12 col-md-6 col-xxl-4">
|
||||
<div class="col-12 col-md-6">
|
||||
<input type="radio" class="btn-check" name="theme" id="theme-dark" value="dark" {{if eq $data.CurrentTheme "dark"}}checked{{end}}>
|
||||
<label class="theme-card d-block h-100" for="theme-dark">
|
||||
<span class="theme-swatch swatch-dark"></span>
|
||||
@@ -33,7 +24,7 @@
|
||||
<small>Neutral dark base</small>
|
||||
</label>
|
||||
</div>
|
||||
<div class="col-12 col-md-6 col-xxl-4">
|
||||
<div class="col-12 col-md-6">
|
||||
<input type="radio" class="btn-check" name="theme" id="theme-light" value="light" {{if eq $data.CurrentTheme "light"}}checked{{end}}>
|
||||
<label class="theme-card d-block h-100" for="theme-light">
|
||||
<span class="theme-swatch swatch-light"></span>
|
||||
@@ -41,30 +32,6 @@
|
||||
<small>Neutral light base</small>
|
||||
</label>
|
||||
</div>
|
||||
<div class="col-12 col-md-6 col-xxl-4">
|
||||
<input type="radio" class="btn-check" name="theme" id="theme-green" value="green" {{if eq $data.CurrentTheme "green"}}checked{{end}}>
|
||||
<label class="theme-card d-block h-100" for="theme-green">
|
||||
<span class="theme-swatch swatch-green"></span>
|
||||
<strong>Green</strong>
|
||||
<small>Computed green accents</small>
|
||||
</label>
|
||||
</div>
|
||||
<div class="col-12 col-md-6 col-xxl-4">
|
||||
<input type="radio" class="btn-check" name="theme" id="theme-red" value="red" {{if eq $data.CurrentTheme "red"}}checked{{end}}>
|
||||
<label class="theme-card d-block h-100" for="theme-red">
|
||||
<span class="theme-swatch swatch-red"></span>
|
||||
<strong>Red</strong>
|
||||
<small>Computed red accents</small>
|
||||
</label>
|
||||
</div>
|
||||
<div class="col-12 col-md-6 col-xxl-4">
|
||||
<input type="radio" class="btn-check" name="theme" id="theme-blue" value="blue" {{if eq $data.CurrentTheme "blue"}}checked{{end}}>
|
||||
<label class="theme-card d-block h-100" for="theme-blue">
|
||||
<span class="theme-swatch swatch-blue"></span>
|
||||
<strong>Blue</strong>
|
||||
<small>Computed blue accents</small>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<button type="submit" class="btn btn-primary">Save Theme</button>
|
||||
@@ -90,4 +57,40 @@
|
||||
</article>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="mt-4">
|
||||
<article class="card border-0 shadow-sm">
|
||||
<div class="card-body p-4">
|
||||
<h2 class="h4 mb-3">Command History</h2>
|
||||
<div class="table-responsive">
|
||||
<table class="table align-middle mb-0">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Group/VMid</th>
|
||||
<th>Time</th>
|
||||
<th>Status</th>
|
||||
<th>Command</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{{if $data.Runs}}
|
||||
{{range $data.Runs}}
|
||||
<tr>
|
||||
<td class="text-nowrap">{{if .GroupName}}{{.GroupName}} / {{end}}{{.NodeName}}</td>
|
||||
<td class="text-nowrap">{{.StartedAt.Format "2006-01-02 15:04:05"}}</td>
|
||||
<td><span class="badge {{if eq .Status "completed"}}text-bg-success{{else}}text-bg-danger{{end}}">{{.Status}}</span></td>
|
||||
<td><code>{{if .CommandText}}{{.CommandText}}{{else}}{{.Action}}{{end}}</code></td>
|
||||
</tr>
|
||||
{{end}}
|
||||
{{else}}
|
||||
<tr>
|
||||
<td colspan="4" class="text-body-secondary">No command history yet.</td>
|
||||
</tr>
|
||||
{{end}}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</article>
|
||||
</section>
|
||||
{{end}}
|
||||
|
||||
182
internal/views/pages/uptime.gohtml
Normal file
182
internal/views/pages/uptime.gohtml
Normal file
@@ -0,0 +1,182 @@
|
||||
{{define "content"}}
{{/*
  Uptime page. Reads from .Content:
    Summary   - TotalMonitors, UpMonitors, DownMonitors, AvgLatencyMS
    Monitors  - one card each: .Monitor plus preformatted text fields and .RecentChecks
    Periods   - rows of the availability table
    Incidents - rows of the incident table
*/}}
{{$data := .Content}}
<section class="row g-3 mb-4">
  <div class="col-12 col-md-6 col-xxl-3">
    <article class="card border-0 shadow-sm uptime-summary-card h-100">
      <div class="card-body">
        <div class="uptime-summary-label">Monitors</div>
        <div class="uptime-summary-value">{{$data.Summary.TotalMonitors}}</div>
      </div>
    </article>
  </div>
  <div class="col-12 col-md-6 col-xxl-3">
    <article class="card border-0 shadow-sm uptime-summary-card h-100">
      <div class="card-body">
        <div class="uptime-summary-label">Up</div>
        <div class="uptime-summary-value">{{$data.Summary.UpMonitors}}</div>
      </div>
    </article>
  </div>
  <div class="col-12 col-md-6 col-xxl-3">
    <article class="card border-0 shadow-sm uptime-summary-card h-100">
      <div class="card-body">
        <div class="uptime-summary-label">Down</div>
        <div class="uptime-summary-value">{{$data.Summary.DownMonitors}}</div>
      </div>
    </article>
  </div>
  <div class="col-12 col-md-6 col-xxl-3">
    <article class="card border-0 shadow-sm uptime-summary-card h-100">
      <div class="card-body">
        <div class="uptime-summary-label">Avg latency</div>
        <div class="uptime-summary-value">{{if $data.Summary.AvgLatencyMS}}{{$data.Summary.AvgLatencyMS}}ms{{else}}-{{end}}</div>
      </div>
    </article>
  </div>
</section>

<section class="row g-4 mb-4">
  <div class="col-12 col-xxl-8">
    <article class="card border-0 shadow-sm">
      <div class="card-body p-4">
        <div class="d-flex align-items-center justify-content-between gap-3 mb-4">
          <h2 class="h4 mb-0">Monitors</h2>
          <span class="text-body-secondary small">SSH endpoint checks</span>
        </div>
        <div class="row g-3">
          {{if $data.Monitors}}
          {{range $data.Monitors}}
          <div class="col-12 col-xl-6">
            <article class="uptime-monitor-card {{if eq .Monitor.LastStatus "down"}}is-down{{else if eq .Monitor.LastStatus "up"}}is-up{{else}}is-pending{{end}}">
              <div class="uptime-monitor-head">
                <div>
                  <div class="fw-semibold text-body-emphasis">{{.Monitor.Name}}</div>
                  <div class="small text-body-secondary">{{.Monitor.Target}}{{if .Monitor.GroupName}} · {{.Monitor.GroupName}}{{end}}</div>
                </div>
                <span class="uptime-monitor-badge {{if eq .Monitor.LastStatus "down"}}is-down{{else if eq .Monitor.LastStatus "up"}}is-up{{else}}is-pending{{end}}">
                  {{if eq .Monitor.LastStatus "down"}}Down{{else if eq .Monitor.LastStatus "up"}}Up{{else}}Pending{{end}}
                </span>
              </div>

              <div class="uptime-monitor-stats">
                <div class="uptime-monitor-stat">
                  <span>Availability</span>
                  <strong>{{.AvailabilityText}}</strong>
                </div>
                <div class="uptime-monitor-stat">
                  <span>Latency</span>
                  <strong>{{if .Monitor.LastLatencyMS}}{{.Monitor.LastLatencyMS}}ms{{else}}-{{end}}</strong>
                </div>
                <div class="uptime-monitor-stat">
                  <span>Checked</span>
                  <strong>{{.LastCheckedText}}</strong>
                </div>
                <div class="uptime-monitor-stat">
                  <span>Interval</span>
                  <strong>{{.IntervalText}}</strong>
                </div>
              </div>

              {{/* The pills are decorative; the "No checks yet." fallback is real text and must stay visible to assistive technology. */}}
              <div class="uptime-check-strip"{{if .RecentChecks}} aria-hidden="true"{{end}}>
                {{if .RecentChecks}}
                {{range .RecentChecks}}
                <span class="uptime-check-pill {{if eq .Status "down"}}is-down{{else if eq .Status "up"}}is-up{{else}}is-pending{{end}}" title="{{.Status}} · {{.CheckedAt.Format "2006-01-02 15:04:05"}}"></span>
                {{end}}
                {{else}}
                <span class="small text-body-secondary">No checks yet.</span>
                {{end}}
              </div>

              <div class="uptime-monitor-foot">
                <span>{{.StateDurationText}}</span>
                <span class="text-truncate">{{if .Monitor.LastError}}{{.Monitor.LastError}}{{else}}{{.Monitor.NodeName}}{{end}}</span>
              </div>
            </article>
          </div>
          {{end}}
          {{else}}
          <div class="col-12">
            <div class="text-body-secondary">No monitors yet. Add a VM to start tracking uptime.</div>
          </div>
          {{end}}
        </div>
      </div>
    </article>
  </div>

  <div class="col-12 col-xxl-4">
    <article class="card border-0 shadow-sm h-100">
      <div class="card-body p-4">
        <h2 class="h4 mb-4">Availability</h2>
        <div class="table-responsive">
          <table class="table align-middle mb-0 uptime-table">
            <thead>
              <tr>
                <th>Period</th>
                <th>Availability</th>
                <th>Downtime</th>
                <th>Incidents</th>
                <th>Longest</th>
                <th>Avg</th>
              </tr>
            </thead>
            <tbody>
              {{range $data.Periods}}
              <tr>
                <td class="text-nowrap">{{.Label}}</td>
                <td class="text-nowrap">{{.AvailabilityText}}</td>
                <td class="text-nowrap">{{.DowntimeText}}</td>
                <td>{{.Incidents}}</td>
                <td class="text-nowrap">{{.LongestText}}</td>
                <td class="text-nowrap">{{.AverageText}}</td>
              </tr>
              {{end}}
            </tbody>
          </table>
        </div>
      </div>
    </article>
  </div>
</section>

<section>
  <article class="card border-0 shadow-sm">
    <div class="card-body p-4">
      <div class="d-flex align-items-center justify-content-between gap-3 mb-4">
        <h2 class="h4 mb-0">Incidents</h2>
        <span class="text-body-secondary small">{{len $data.Incidents}} recent</span>
      </div>
      <div class="table-responsive">
        <table class="table align-middle mb-0 uptime-table">
          <thead>
            <tr>
              <th>Monitor</th>
              <th>Started</th>
              <th>Ended</th>
              <th>Duration</th>
              <th>Error</th>
            </tr>
          </thead>
          <tbody>
            {{if $data.Incidents}}
            {{range $data.Incidents}}
            <tr>
              <td class="text-nowrap">{{.MonitorName}}</td>
              <td class="text-nowrap">{{.StartedAt.Format "2006-01-02 15:04:05"}}</td>
              <td class="text-nowrap">{{if .EndedAt}}{{.EndedAt.Format "2006-01-02 15:04:05"}}{{else}}Active{{end}}</td>
              <td class="text-nowrap">{{.DurationText}}</td>
              <td>{{if .ErrorMessage}}{{.ErrorMessage}}{{else}}Connection failed{{end}}</td>
            </tr>
            {{end}}
            {{else}}
            <tr>
              <td colspan="5" class="text-body-secondary">No incidents recorded yet.</td>
            </tr>
            {{end}}
          </tbody>
        </table>
      </div>
    </div>
  </article>
</section>
{{end}}
|
||||
Reference in New Issue
Block a user