diff --git a/README.md b/README.md index 3d8879ad..cef05eaf 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ For more details, see [Usage](#usage) - [Configuring Twilio alerts](#configuring-twilio-alerts) - [Configuring custom alerts](#configuring-custom-alerts) - [Setting a default alert](#setting-a-default-alert) + - [Maintenance](#maintenance) - [Deployment](#deployment) - [Docker](#docker) - [Helm Chart](#helm-chart) @@ -736,6 +737,37 @@ services: - type: pagerduty ``` +### Maintenance +If you have maintenance windows, you may not want to be annoyed by alerts. +To do that, you'll have to use the maintenance configuration: + +| Parameter | Description | Default | +|:----------------------- |:----------------------------------------------------------------------------- |:--------------- | +| `maintenance.enabled` | Whether the maintenance period is enabled | `true` | +| `maintenance.start` | Time at which the maintenance window starts in `hh:mm` format (e.g. `23:00`) | Required `""` | +| `maintenance.duration` | Duration of the maintenance window (e.g. `1h`, `30m`) | Required `""` | +| `maintenance.every` | Days on which the maintenance period applies (e.g. `[Monday, Thursday]`).
If left empty, the maintenance window applies every day | `[]` | +**Note that the maintenance configuration uses UTC.** + + +Here's an example: +```yaml +maintenance: + start: 23:00 + duration: 1h + every: [Monday, Thursday] +``` +Note that you can also specify each day on separate lines: +```yaml +maintenance: + start: 23:00 + duration: 1h + every: + - Monday + - Thursday +``` + + ## Deployment Many examples can be found in the [examples](examples) folder, but this section will focus on the most popular ways of deploying Gatus. diff --git a/config/config.go b/config/config.go index 90945e39..5df80299 100644 --- a/config/config.go +++ b/config/config.go @@ -10,6 +10,7 @@ import ( "github.com/TwinProduction/gatus/alerting" "github.com/TwinProduction/gatus/alerting/alert" "github.com/TwinProduction/gatus/alerting/provider" + "github.com/TwinProduction/gatus/config/maintenance" "github.com/TwinProduction/gatus/core" "github.com/TwinProduction/gatus/security" "github.com/TwinProduction/gatus/storage" @@ -82,6 +83,9 @@ type Config struct { // UI is the configuration for the UI UI *UIConfig `yaml:"ui"` + // Maintenance is the configuration for creating a maintenance window in which no alerts are sent + Maintenance *maintenance.Config `yaml:"maintenance"` + filePath string // path to the file from which config was loaded from lastFileModTime time.Time // last modification time } @@ -172,6 +176,9 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) { if err := validateUIConfig(config); err != nil { return nil, err } + if err := validateMaintenanceConfig(config); err != nil { + return nil, err + } if err := validateStorageConfig(config); err != nil { return nil, err } @@ -201,6 +208,17 @@ func validateStorageConfig(config *Config) error { return nil } +func validateMaintenanceConfig(config *Config) error { + if config.Maintenance == nil { + config.Maintenance = maintenance.GetDefaultConfig() + } else { + if err := 
config.Maintenance.ValidateAndSetDefaults(); err != nil { + return err + } + } + return nil +} + func validateUIConfig(config *Config) error { if config.UI == nil { config.UI = GetDefaultUIConfig() diff --git a/config/config_test.go b/config/config_test.go index c5147adf..14ca81ba 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -43,6 +43,11 @@ func TestParseAndValidateConfigBytes(t *testing.T) { config, err := parseAndValidateConfigBytes([]byte(fmt.Sprintf(` storage: file: %s +maintenance: + enabled: true + start: 00:00 + duration: 4h + every: [Monday, Thursday] ui: title: Test services: @@ -79,6 +84,9 @@ services: if config.UI == nil || config.UI.Title != "Test" { t.Error("Expected Config.UI.Title to be Test") } + if mc := config.Maintenance; mc == nil || mc.Start != "00:00" || !mc.IsEnabled() || mc.Duration != 4*time.Hour || len(mc.Every) != 2 { + t.Error("Expected Config.Maintenance to be configured properly") + } if len(config.Services) != 3 { t.Error("Should have returned two services") } diff --git a/config/maintenance/maintenance.go b/config/maintenance/maintenance.go new file mode 100644 index 00000000..29da9294 --- /dev/null +++ b/config/maintenance/maintenance.go @@ -0,0 +1,133 @@ +package maintenance + +import ( + "errors" + "fmt" + "sort" + "strconv" + "strings" + "time" +) + +var ( + errInvalidMaintenanceStartFormat = errors.New("invalid maintenance start format: must be hh:mm, between 00:00 and 23:59 inclusively (e.g. 23:00)") + errInvalidMaintenanceDuration = errors.New("invalid maintenance duration: must be bigger than 0 (e.g. 30m)") + errInvalidDayName = fmt.Errorf("invalid value specified for 'on'. supported values are %s", longDayNames) + + longDayNames = []string{ + "Sunday", + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + } +) + +// Config allows for the configuration of a maintenance period. +// During this maintenance period, no alerts will be sent. +// +// Uses UTC. 
+type Config struct {
+	Enabled  *bool         `yaml:"enabled"`  // Whether the maintenance period is enabled. Enabled by default if nil.
+	Start    string        `yaml:"start"`    // Time at which the maintenance period starts (e.g. 23:00)
+	Duration time.Duration `yaml:"duration"` // Duration of the maintenance period (e.g. 4h)
+
+	// Every is a list of days of the week on which the maintenance period starts.
+	// See longDayNames for list of valid values. Sorted in place by ValidateAndSetDefaults.
+	// Every day if empty.
+	Every []string `yaml:"every"`
+
+	durationToStartFromMidnight time.Duration
+	timeLocation                *time.Location
+}
+
+// GetDefaultConfig returns a maintenance configuration with Enabled explicitly set to false.
+func GetDefaultConfig() *Config {
+	enabled := false
+	return &Config{Enabled: &enabled}
+}
+
+// IsEnabled returns whether maintenance is enabled or not. A nil Enabled counts as enabled.
+func (c Config) IsEnabled() bool {
+	return c.Enabled == nil || *c.Enabled
+}
+
+// ValidateAndSetDefaults validates the maintenance configuration and sets the default values if necessary.
+//
+// Must be called once in the application's lifecycle before IsUnderMaintenance is called, since it
+// also sets durationToStartFromMidnight and sorts Every.
+func (c *Config) ValidateAndSetDefaults() error {
+	if c == nil || !c.IsEnabled() {
+		// Don't waste time validating if maintenance is not enabled.
+		return nil
+	}
+validateDays:
+	for _, day := range c.Every {
+		for _, longDayName := range longDayNames {
+			if day == longDayName {
+				continue validateDays
+			}
+		}
+		return errInvalidDayName
+	}
+	// IsUnderMaintenance looks weekdays up with a binary search, which requires Every to be sorted.
+	sort.Strings(c.Every)
+	var err error
+	if c.durationToStartFromMidnight, err = hhmmToDuration(c.Start); err != nil {
+		return err
+	}
+	if c.Duration <= 0 || c.Duration >= 24*time.Hour {
+		return errInvalidMaintenanceDuration
+	}
+	return nil
+}
+
+// IsUnderMaintenance checks whether the services that Gatus monitors are within the configured maintenance window.
+// The window is evaluated in UTC and includes its start instant but not its end instant.
+func (c Config) IsUnderMaintenance() bool {
+	if !c.IsEnabled() {
+		return false
+	}
+	now := time.Now().UTC()
+	today := now.Truncate(24 * time.Hour)
+	// The window lasts less than 24h, so only a window that started today or yesterday can contain now.
+	for _, day := range []time.Time{today, today.Add(-24 * time.Hour)} {
+		weekday := day.Weekday().String()
+		// SearchStrings only returns an insertion index, so the element found there must be compared too.
+		if i := sort.SearchStrings(c.Every, weekday); len(c.Every) != 0 && (i == len(c.Every) || c.Every[i] != weekday) {
+			continue
+		}
+		start := day.Add(c.durationToStartFromMidnight)
+		if !now.Before(start) && now.Before(start.Add(c.Duration)) {
+			return true
+		}
+	}
+	return false
+}
+
+// hhmmToDuration converts a time of day in hh:mm format into the duration elapsed since midnight.
+func hhmmToDuration(s string) (time.Duration, error) {
+	if len(s) != 5 || s[2] != ':' {
+		return 0, errInvalidMaintenanceStartFormat
+	}
+	var hours, minutes int
+	var err error
+	if hours, err = extractNumericalValueFromPotentiallyZeroPaddedString(s[:2]); err != nil {
+		return 0, err
+	}
+	if minutes, err = extractNumericalValueFromPotentiallyZeroPaddedString(s[3:5]); err != nil {
+		return 0, err
+	}
+	duration := (time.Duration(hours) * time.Hour) + (time.Duration(minutes) * time.Minute)
+	if hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || duration < 0 || duration >= 24*time.Hour {
+		return 0, errInvalidMaintenanceStartFormat
+	}
+	return duration, nil
+}
+
+// extractNumericalValueFromPotentiallyZeroPaddedString parses s as a base 10 integer after dropping one leading "0".
+func extractNumericalValueFromPotentiallyZeroPaddedString(s string) (int, error) {
+	return strconv.Atoi(strings.TrimPrefix(s, "0"))
+}
diff --git a/config/maintenance/maintenance_test.go b/config/maintenance/maintenance_test.go
new file mode 100644
index 00000000..8b087996
--- /dev/null
+++ b/config/maintenance/maintenance_test.go
@@ -0,0 +1,193 @@
+package maintenance
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"testing"
+	"time"
+)
+
+func TestGetDefaultConfig(t *testing.T) {
+	if *GetDefaultConfig().Enabled {
+		t.Fatal("expected default config to be disabled by default")
+	}
+}
+
+func TestConfig_Validate(t *testing.T) {
+	yes, no := true, false
+	scenarios := []struct {
+		name          string
+		cfg           *Config
+		expectedError error
+	}{
+		{
+			name:          "nil",
+			cfg:           nil,
+			expectedError: nil,
+		},
+		{
+			name: "disabled",
+			cfg: &Config{
+				Enabled: &no,
+			},
+			expectedError: nil,
+		},
+		{
+			name: "invalid-day",
+			cfg: &Config{
+				Every: []string{"invalid-day"},
+			},
+			expectedError: errInvalidDayName,
+		},
+		{
+			name: "invalid-day",
+			cfg: &Config{
+
Every: []string{"invalid-day"}, + }, + expectedError: errInvalidDayName, + }, + { + name: "invalid-start-format", + cfg: &Config{ + Start: "0000", + }, + expectedError: errInvalidMaintenanceStartFormat, + }, + { + name: "invalid-start-hours", + cfg: &Config{ + Start: "25:00", + }, + expectedError: errInvalidMaintenanceStartFormat, + }, + { + name: "invalid-start-minutes", + cfg: &Config{ + Start: "0:61", + }, + expectedError: errInvalidMaintenanceStartFormat, + }, + { + name: "invalid-start-minutes-non-numerical", + cfg: &Config{ + Start: "00:zz", + }, + expectedError: strconv.ErrSyntax, + }, + { + name: "invalid-start-hours-non-numerical", + cfg: &Config{ + Start: "zz:00", + }, + expectedError: strconv.ErrSyntax, + }, + { + name: "invalid-duration", + cfg: &Config{ + Start: "23:00", + Duration: 0, + }, + expectedError: errInvalidMaintenanceDuration, + }, + { + name: "every-day-at-2300", + cfg: &Config{ + Start: "23:00", + Duration: time.Hour, + }, + expectedError: nil, + }, + { + name: "every-monday-at-0000", + cfg: &Config{ + Start: "00:00", + Duration: 30 * time.Minute, + Every: []string{"Monday"}, + }, + expectedError: nil, + }, + { + name: "every-friday-and-sunday-at-0000-explicitly-enabled", + cfg: &Config{ + Enabled: &yes, + Start: "08:00", + Duration: 8 * time.Hour, + Every: []string{"Friday", "Sunday"}, + }, + expectedError: nil, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + err := scenario.cfg.ValidateAndSetDefaults() + if !errors.Is(err, scenario.expectedError) { + t.Errorf("expected %v, got %v", scenario.expectedError, err) + } + }) + } +} + +func TestConfig_IsUnderMaintenance(t *testing.T) { + yes, no := true, false + now := time.Now().UTC() + scenarios := []struct { + name string + cfg *Config + expected bool + }{ + { + name: "disabled", + cfg: &Config{ + Enabled: &no, + }, + expected: false, + }, + { + name: "under-maintenance-explicitly-enabled", + cfg: &Config{ + Enabled: &yes, + Start: 
fmt.Sprintf("%02d:00", now.Hour()), + Duration: 2 * time.Hour, + }, + expected: true, + }, + { + name: "under-maintenance", + cfg: &Config{ + Start: fmt.Sprintf("%02d:00", now.Hour()), + Duration: 2 * time.Hour, + }, + expected: true, + }, + { + name: "not-under-maintenance", + cfg: &Config{ + Start: fmt.Sprintf("%02d:00", now.Add(-5*time.Hour).Hour()), + Duration: time.Hour, + }, + expected: false, + }, + { + name: "not-under-maintenance-today", + cfg: &Config{ + Start: fmt.Sprintf("%02d:00", now.Hour()), + Duration: time.Hour, + Every: []string{now.Add(48 * time.Hour).Weekday().String()}, + }, + expected: false, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + if scenario.cfg.ValidateAndSetDefaults() != nil { + t.Fatal("validation shouldn't have returned an error") + } + isUnderMaintenance := scenario.cfg.IsUnderMaintenance() + if isUnderMaintenance != scenario.expected { + t.Errorf("expected %v, got %v", scenario.expected, isUnderMaintenance) + t.Logf("start=%v; duration=%v; now=%v", scenario.cfg.Start, scenario.cfg.Duration, time.Now().UTC()) + } + }) + } +} diff --git a/watchdog/watchdog.go b/watchdog/watchdog.go index 4689cfef..14f9bfe8 100644 --- a/watchdog/watchdog.go +++ b/watchdog/watchdog.go @@ -8,6 +8,7 @@ import ( "github.com/TwinProduction/gatus/alerting" "github.com/TwinProduction/gatus/config" + "github.com/TwinProduction/gatus/config/maintenance" "github.com/TwinProduction/gatus/core" "github.com/TwinProduction/gatus/metric" "github.com/TwinProduction/gatus/storage" @@ -27,17 +28,17 @@ func Monitor(cfg *config.Config) { ctx, cancelFunc = context.WithCancel(context.Background()) for _, service := range cfg.Services { if service.IsEnabled() { - // To prevent multiple requests from running at the same time, we'll wait for a little bit before each iteration + // To prevent multiple requests from running at the same time, we'll wait for a little before each iteration time.Sleep(1111 * time.Millisecond) - go 
monitor(service, cfg.Alerting, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx) + go monitor(service, cfg.Alerting, cfg.Maintenance, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx) } } } // monitor monitors a single service in a loop -func monitor(service *core.Service, alertingConfig *alerting.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) { +func monitor(service *core.Service, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) { // Run it immediately on start - execute(service, alertingConfig, disableMonitoringLock, enabledMetrics, debug) + execute(service, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug) // Loop for the next executions for { select { @@ -45,12 +46,12 @@ func monitor(service *core.Service, alertingConfig *alerting.Config, disableMoni log.Printf("[watchdog][monitor] Canceling current execution of group=%s; service=%s", service.Group, service.Name) return case <-time.After(service.Interval): - execute(service, alertingConfig, disableMonitoringLock, enabledMetrics, debug) + execute(service, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug) } } } -func execute(service *core.Service, alertingConfig *alerting.Config, disableMonitoringLock, enabledMetrics, debug bool) { +func execute(service *core.Service, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool) { if !disableMonitoringLock { // By placing the lock here, we prevent multiple services from being monitored at the exact same time, which // could cause performance issues and return inaccurate results @@ -72,7 +73,11 @@ func execute(service *core.Service, alertingConfig *alerting.Config, disableMoni len(result.Errors), result.Duration.Round(time.Millisecond), ) - HandleAlerting(service, result, alertingConfig, debug) + if 
!maintenanceConfig.IsUnderMaintenance() { + HandleAlerting(service, result, alertingConfig, debug) + } else if debug { + log.Println("[watchdog][execute] Not handling alerting because currently in the maintenance window") + } if debug { log.Printf("[watchdog][execute] Waiting for interval=%s before monitoring group=%s service=%s again", service.Interval, service.Group, service.Name) } @@ -83,7 +88,9 @@ func execute(service *core.Service, alertingConfig *alerting.Config, disableMoni // UpdateServiceStatuses updates the slice of service statuses func UpdateServiceStatuses(service *core.Service, result *core.Result) { - storage.Get().Insert(service, result) + if err := storage.Get().Insert(service, result); err != nil { + log.Println("[watchdog][UpdateServiceStatuses] Failed to insert data in storage:", err.Error()) + } } // Shutdown stops monitoring all services