Merge pull request #10 from TwinProduction/notify-on-resolved

Support sending an alert when an unhealthy service becomes healthy again
This commit is contained in:
Christian C 2020-09-04 22:23:47 -04:00 committed by GitHub
commit 4df1baf432
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 259 additions and 117 deletions

BIN
.github/assets/slack-alerts.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

View File

@ -24,7 +24,7 @@ core applications: https://status.twinnation.org/
- [Sending a GraphQL request](#sending-a-graphql-request) - [Sending a GraphQL request](#sending-a-graphql-request)
- [Configuring Slack alerts](#configuring-slack-alerts) - [Configuring Slack alerts](#configuring-slack-alerts)
- [Configuring Twilio alerts](#configuring-twilio-alerts) - [Configuring Twilio alerts](#configuring-twilio-alerts)
- [Configuring custom alert](#configuring-custom-alerts) - [Configuring custom alerts](#configuring-custom-alerts)
## Features ## Features
@ -67,13 +67,14 @@ This example would look like this:
![Simple example](.github/assets/example.png) ![Simple example](.github/assets/example.png)
Note that you can also add environment variables in the your configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) Note that you can also add environment variables in the configuration file (i.e. `$DOMAIN`, `${DOMAIN}`)
### Configuration ### Configuration
| Parameter | Description | Default | | Parameter | Description | Default |
| --------------------------------- | --------------------------------------------------------------- | -------------- | | -------------------------------------- | --------------------------------------------------------------- | -------------- |
| `debug` | Whether to enable debug logs | `false` |
| `metrics` | Whether to expose metrics at /metrics | `false` | | `metrics` | Whether to expose metrics at /metrics | `false` |
| `services` | List of services to monitor | Required `[]` | | `services` | List of services to monitor | Required `[]` |
| `services[].name` | Name of the service. Can be anything. | Required `""` | | `services[].name` | Name of the service. Can be anything. | Required `""` |
@ -88,6 +89,7 @@ Note that you can also add environment variables in the your configuration file
| `services[].alerts[].enabled` | Whether to enable the alert | `false` | | `services[].alerts[].enabled` | Whether to enable the alert | `false` |
| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | | `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` | | `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` |
| `alerting` | Configuration for alerting | `{}` | | `alerting` | Configuration for alerting | `{}` |
| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | | `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | | `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
@ -121,7 +123,7 @@ Here are some examples of conditions you can use:
## Docker ## Docker
Building the Docker image is done as following: Building the Docker image is done as follows:
``` ```
docker build . -t gatus docker build . -t gatus
@ -194,33 +196,37 @@ services:
- type: slack - type: slack
enabled: true enabled: true
description: "healthcheck failed 3 times in a row" description: "healthcheck failed 3 times in a row"
send-on-resolved: true
- type: slack - type: slack
enabled: true enabled: true
threshold: 5 threshold: 5
description: "healthcheck failed 5 times in a row" description: "healthcheck failed 5 times in a row"
send-on-resolved: true
conditions: conditions:
- "[STATUS] == 200" - "[STATUS] == 200"
- "[BODY].status == UP" - "[BODY].status == UP"
- "[RESPONSE_TIME] < 300" - "[RESPONSE_TIME] < 300"
``` ```
Here's an example of what the notifications look like:
![Slack notifications](.github/assets/slack-alerts.png)
### Configuring Twilio alerts ### Configuring Twilio alerts
```yaml ```yaml
alerting: alerting:
twilio: twilio:
sid: **** sid: "..."
token: **** token: "..."
from: +1-234-567-8901 from: "+1-234-567-8901"
to: +1-234-567-8901 to: "+1-234-567-8901"
services: services:
- name: twinnation - name: twinnation
interval: 30s interval: 30s
url: "https://twinnation.org/health" url: "https://twinnation.org/health"
alerts: alerts:
- type: twilio
enabled: true
description: "healthcheck failed 3 times in a row"
- type: twilio - type: twilio
enabled: true enabled: true
threshold: 5 threshold: 5
@ -242,7 +248,10 @@ would then check if the service that started failing was recently deployed, and
roll it back. roll it back.
The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the
service name respectively in the body (`alerting.custom.body`) and the url (`alerting.custom.url`). service name respectively in the body (`alerting.custom.body`) as well as the url (`alerting.custom.url`).
If you have `send-on-resolved` set to `true`, you may want to use `[ALERT_TRIGGERED_OR_RESOLVED]` to differentiate
the notifications. It will be replaced with either `TRIGGERED` or `RESOLVED`, based on the situation.
For all intents and purposes, we'll configure the custom alert with a Slack webhook, but you can call anything you want. For all intents and purposes, we'll configure the custom alert with a Slack webhook, but you can call anything you want.
@ -253,7 +262,7 @@ alerting:
method: "POST" method: "POST"
body: | body: |
{ {
"text": "[SERVICE_NAME] - [ALERT_DESCRIPTION]" "text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"
} }
services: services:
- name: twinnation - name: twinnation
@ -263,6 +272,7 @@ services:
- type: custom - type: custom
enabled: true enabled: true
threshold: 10 threshold: 10
send-on-resolved: true
description: "healthcheck failed 10 times in a row" description: "healthcheck failed 10 times in a row"
conditions: conditions:
- "[STATUS] == 200" - "[STATUS] == 200"

View File

@ -22,6 +22,7 @@ var (
type Config struct { type Config struct {
Metrics bool `yaml:"metrics"` Metrics bool `yaml:"metrics"`
Debug bool `yaml:"debug"`
Alerting *core.AlertingConfig `yaml:"alerting"` Alerting *core.AlertingConfig `yaml:"alerting"`
Services []*core.Service `yaml:"services"` Services []*core.Service `yaml:"services"`
} }

View File

@ -13,6 +13,9 @@ type Alert struct {
// Description of the alert. Will be included in the alert sent. // Description of the alert. Will be included in the alert sent.
Description string `yaml:"description"` Description string `yaml:"description"`
// SendOnResolved defines whether to send a second notification when the issue has been resolved
SendOnResolved bool `yaml:"send-on-resolved"`
} }
type AlertType string type AlertType string

View File

@ -2,9 +2,11 @@ package core
import ( import (
"bytes" "bytes"
"encoding/base64"
"fmt" "fmt"
"github.com/TwinProduction/gatus/client" "github.com/TwinProduction/gatus/client"
"net/http" "net/http"
"net/url"
"strings" "strings"
) )
@ -21,6 +23,10 @@ type TwilioAlertProvider struct {
To string `yaml:"to"` To string `yaml:"to"`
} }
// IsValid reports whether every field needed to send a Twilio alert
// (Token, SID, From and To) is non-empty.
func (provider *TwilioAlertProvider) IsValid() bool {
	if provider.Token == "" || provider.SID == "" {
		return false
	}
	return provider.From != "" && provider.To != ""
}
type CustomAlertProvider struct { type CustomAlertProvider struct {
Url string `yaml:"url"` Url string `yaml:"url"`
Method string `yaml:"method,omitempty"` Method string `yaml:"method,omitempty"`
@ -28,31 +34,49 @@ type CustomAlertProvider struct {
Headers map[string]string `yaml:"headers,omitempty"` Headers map[string]string `yaml:"headers,omitempty"`
} }
func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription string) *http.Request { func (provider *CustomAlertProvider) IsValid() bool {
return len(provider.Url) > 0
}
func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription string, resolved bool) *http.Request {
body := provider.Body body := provider.Body
url := provider.Url providerUrl := provider.Url
if strings.Contains(provider.Body, "[ALERT_DESCRIPTION]") { if strings.Contains(body, "[ALERT_DESCRIPTION]") {
body = strings.ReplaceAll(provider.Body, "[ALERT_DESCRIPTION]", alertDescription) body = strings.ReplaceAll(body, "[ALERT_DESCRIPTION]", alertDescription)
} }
if strings.Contains(provider.Body, "[SERVICE_NAME]") { if strings.Contains(body, "[SERVICE_NAME]") {
body = strings.ReplaceAll(provider.Body, "[SERVICE_NAME]", serviceName) body = strings.ReplaceAll(body, "[SERVICE_NAME]", serviceName)
} }
if strings.Contains(provider.Url, "[ALERT_DESCRIPTION]") { if strings.Contains(body, "[ALERT_TRIGGERED_OR_RESOLVED]") {
url = strings.ReplaceAll(provider.Url, "[ALERT_DESCRIPTION]", alertDescription) if resolved {
body = strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", "RESOLVED")
} else {
body = strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", "TRIGGERED")
}
}
if strings.Contains(providerUrl, "[ALERT_DESCRIPTION]") {
providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_DESCRIPTION]", alertDescription)
}
if strings.Contains(providerUrl, "[SERVICE_NAME]") {
providerUrl = strings.ReplaceAll(providerUrl, "[SERVICE_NAME]", serviceName)
}
if strings.Contains(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]") {
if resolved {
providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]", "RESOLVED")
} else {
providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]", "TRIGGERED")
} }
if strings.Contains(provider.Url, "[SERVICE_NAME]") {
url = strings.ReplaceAll(provider.Url, "[SERVICE_NAME]", serviceName)
} }
bodyBuffer := bytes.NewBuffer([]byte(body)) bodyBuffer := bytes.NewBuffer([]byte(body))
request, _ := http.NewRequest(provider.Method, url, bodyBuffer) request, _ := http.NewRequest(provider.Method, providerUrl, bodyBuffer)
for k, v := range provider.Headers { for k, v := range provider.Headers {
request.Header.Set(k, v) request.Header.Set(k, v)
} }
return request return request
} }
func (provider *CustomAlertProvider) Send(serviceName, alertDescription string) error { func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) error {
request := provider.buildRequest(serviceName, alertDescription) request := provider.buildRequest(serviceName, alertDescription, resolved)
response, err := client.GetHttpClient().Do(request) response, err := client.GetHttpClient().Do(request)
if err != nil { if err != nil {
return err return err
@ -62,3 +86,64 @@ func (provider *CustomAlertProvider) Send(serviceName, alertDescription string)
} }
return nil return nil
} }
// CreateSlackCustomAlertProvider builds a CustomAlertProvider that POSTs a
// Slack attachment to slackWebHookUrl.
//
// When resolved is true the message announces that the alert for the service
// was resolved (including service.NumberOfFailuresInARow) and the attachment
// is colored green; otherwise it announces that the alert was triggered and
// the attachment is colored red. Each entry of result.ConditionResults is
// listed in a "Condition results" field, prefixed with a check mark or a cross
// depending on its Success flag.
//
// NOTE(review): service.Name, alert.Description and the condition strings are
// interpolated into the JSON body as-is; values containing `"` or `\` would
// still produce a malformed payload.
func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider {
	var message, color string
	if resolved {
		message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow)
		color = "#36A64F"
	} else {
		message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name)
		color = "#DD0000"
	}
	var results strings.Builder
	for _, conditionResult := range result.ConditionResults {
		prefix := ":x:"
		if conditionResult.Success {
			prefix = ":heavy_check_mark:"
		}
		// Emit the two-character JSON escape `\n` rather than a raw line feed:
		// control characters are not allowed unescaped inside a JSON string.
		fmt.Fprintf(&results, "%s - `%s`\\n", prefix, conditionResult.Condition)
	}
	// No trailing comma after the single attachment object: JSON forbids it.
	body := fmt.Sprintf(`{
  "text": "",
  "attachments": [
    {
      "title": ":helmet_with_white_cross: Gatus",
      "text": "%s:\n> %s",
      "short": false,
      "color": "%s",
      "fields": [
        {
          "title": "Condition results",
          "value": "%s",
          "short": false
        }
      ]
    }
  ]
}`, message, alert.Description, color, results.String())
	return &CustomAlertProvider{
		Url:     slackWebHookUrl,
		Method:  "POST",
		Body:    body,
		Headers: map[string]string{"Content-Type": "application/json"},
	}
}
// CreateTwilioCustomAlertProvider builds a CustomAlertProvider that POSTs
// message as a form-encoded request to the Twilio Messages endpoint of the
// account identified by provider.SID, authenticating with HTTP Basic auth
// derived from provider.SID and provider.Token.
func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message string) *CustomAlertProvider {
	form := url.Values{}
	form.Set("To", provider.To)
	form.Set("From", provider.From)
	form.Set("Body", message)

	credentials := []byte(provider.SID + ":" + provider.Token)
	headers := map[string]string{
		"Content-Type":  "application/x-www-form-urlencoded",
		"Authorization": "Basic " + base64.StdEncoding.EncodeToString(credentials),
	}

	endpoint := fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", provider.SID)
	return &CustomAlertProvider{
		Url:     endpoint,
		Method:  "POST",
		Body:    form.Encode(),
		Headers: headers,
	}
}

View File

@ -46,7 +46,7 @@ type Service struct {
// Alerts is the alerting configuration for the service in case of failure // Alerts is the alerting configuration for the service in case of failure
Alerts []*Alert `yaml:"alerts"` Alerts []*Alert `yaml:"alerts"`
numberOfFailuresInARow int NumberOfFailuresInARow int
} }
func (service *Service) Validate() { func (service *Service) Validate() {
@ -94,22 +94,16 @@ func (service *Service) EvaluateConditions() *Result {
} }
} }
result.Timestamp = time.Now() result.Timestamp = time.Now()
if result.Success {
service.numberOfFailuresInARow = 0
// TODO: Send notification that alert has been resolved?
} else {
service.numberOfFailuresInARow++
}
return result return result
} }
func (service *Service) GetAlertsTriggered() []Alert { func (service *Service) GetAlertsTriggered() []Alert {
var alerts []Alert var alerts []Alert
if service.numberOfFailuresInARow == 0 { if service.NumberOfFailuresInARow == 0 {
return alerts return alerts
} }
for _, alert := range service.Alerts { for _, alert := range service.Alerts {
if alert.Enabled && alert.Threshold == service.numberOfFailuresInARow { if alert.Enabled && alert.Threshold == service.NumberOfFailuresInARow {
alerts = append(alerts, *alert) alerts = append(alerts, *alert)
continue continue
} }

1
go.sum
View File

@ -18,6 +18,7 @@ github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/me
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=

View File

@ -3,7 +3,6 @@ package main
import ( import (
"bytes" "bytes"
"compress/gzip" "compress/gzip"
"encoding/json"
"github.com/TwinProduction/gatus/config" "github.com/TwinProduction/gatus/config"
"github.com/TwinProduction/gatus/watchdog" "github.com/TwinProduction/gatus/watchdog"
"github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promhttp"
@ -53,12 +52,11 @@ func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) {
if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired { if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired {
buffer := &bytes.Buffer{} buffer := &bytes.Buffer{}
gzipWriter := gzip.NewWriter(buffer) gzipWriter := gzip.NewWriter(buffer)
serviceResults := watchdog.GetServiceResults() data, err := watchdog.GetJsonEncodedServiceResults()
data, err := json.Marshal(serviceResults)
if err != nil { if err != nil {
log.Printf("[main][serviceResultsHandler] Unable to marshall object to JSON: %s", err.Error()) log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error())
writer.WriteHeader(http.StatusInternalServerError) writer.WriteHeader(http.StatusInternalServerError)
_, _ = writer.Write([]byte("Unable to marshall object to JSON")) _, _ = writer.Write([]byte("Unable to marshal object to JSON"))
return return
} }
gzipWriter.Write(data) gzipWriter.Write(data)

View File

@ -1,25 +1,34 @@
package watchdog package watchdog
import ( import (
"encoding/base64" "encoding/json"
"fmt" "fmt"
"github.com/TwinProduction/gatus/config" "github.com/TwinProduction/gatus/config"
"github.com/TwinProduction/gatus/core" "github.com/TwinProduction/gatus/core"
"github.com/TwinProduction/gatus/metric" "github.com/TwinProduction/gatus/metric"
"log" "log"
"net/url"
"sync" "sync"
"time" "time"
) )
var ( var (
serviceResults = make(map[string][]*core.Result) serviceResults = make(map[string][]*core.Result)
rwLock sync.RWMutex
// serviceResultsMutex is used to prevent concurrent map access
serviceResultsMutex sync.RWMutex
// monitoringMutex is used to prevent multiple services from being evaluated at the same time.
// Without this, conditions using response time may become inaccurate.
monitoringMutex sync.Mutex
) )
// GetServiceResults returns a list of the last 20 results for each service // GetJsonEncodedServiceResults returns a list of the last 20 results for each service encoded using json.Marshal.
func GetServiceResults() *map[string][]*core.Result { // The reason why the encoding is done here is because we use a mutex to prevent concurrent map access.
return &serviceResults func GetJsonEncodedServiceResults() ([]byte, error) {
serviceResultsMutex.RLock()
data, err := json.Marshal(serviceResults)
serviceResultsMutex.RUnlock()
return data, err
} }
// Monitor loops over each service and starts a goroutine to monitor each service separately // Monitor loops over each service and starts a goroutine to monitor each service separately
@ -33,71 +42,72 @@ func Monitor(cfg *config.Config) {
// monitor monitors a single service in a loop // monitor monitors a single service in a loop
func monitor(service *core.Service) { func monitor(service *core.Service) {
cfg := config.Get()
for { for {
// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which // By placing the lock here, we prevent multiple services from being monitored at the exact same time, which
// could cause performance issues and return inaccurate results // could cause performance issues and return inaccurate results
rwLock.Lock() monitoringMutex.Lock()
if cfg.Debug {
log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name)
}
result := service.EvaluateConditions() result := service.EvaluateConditions()
metric.PublishMetricsForService(service, result) metric.PublishMetricsForService(service, result)
serviceResultsMutex.Lock()
serviceResults[service.Name] = append(serviceResults[service.Name], result) serviceResults[service.Name] = append(serviceResults[service.Name], result)
if len(serviceResults[service.Name]) > 20 { if len(serviceResults[service.Name]) > 20 {
serviceResults[service.Name] = serviceResults[service.Name][1:] serviceResults[service.Name] = serviceResults[service.Name][1:]
} }
rwLock.Unlock() serviceResultsMutex.Unlock()
var extra string var extra string
if !result.Success { if !result.Success {
extra = fmt.Sprintf("responseBody=%s", result.Body) extra = fmt.Sprintf("responseBody=%s", result.Body)
} }
log.Printf( log.Printf(
"[watchdog][monitor] Finished monitoring serviceName=%s; errors=%d; requestDuration=%s; %s", "[watchdog][monitor] Monitored serviceName=%s; success=%v; errors=%d; requestDuration=%s; %s",
service.Name, service.Name,
result.Success,
len(result.Errors), len(result.Errors),
result.Duration.Round(time.Millisecond), result.Duration.Round(time.Millisecond),
extra, extra,
) )
handleAlerting(service, result)
if cfg.Debug {
log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name)
}
monitoringMutex.Unlock()
time.Sleep(service.Interval)
}
}
func handleAlerting(service *core.Service, result *core.Result) {
cfg := config.Get() cfg := config.Get()
if cfg.Alerting != nil { if cfg.Alerting == nil {
for _, alertTriggered := range service.GetAlertsTriggered() { return
}
if result.Success {
if service.NumberOfFailuresInARow > 0 {
for _, alert := range service.Alerts {
if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow {
continue
}
var alertProvider *core.CustomAlertProvider var alertProvider *core.CustomAlertProvider
if alertTriggered.Type == core.SlackAlert { if alert.Type == core.SlackAlert {
if len(cfg.Alerting.Slack) > 0 { if len(cfg.Alerting.Slack) > 0 {
log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been triggered", alertTriggered.Description) log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been resolved", alert.Description)
alertProvider = &core.CustomAlertProvider{ alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
Url: cfg.Alerting.Slack,
Method: "POST",
Body: fmt.Sprintf(`{"text":"*[Gatus]*\n*service:* %s\n*description:* %s"}`, service.Name, alertTriggered.Description),
Headers: map[string]string{"Content-Type": "application/json"},
}
} else { } else {
log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
}
} else if alertTriggered.Type == core.TwilioAlert {
if len(cfg.Alerting.Twilio.Token) > 0 &&
len(cfg.Alerting.Twilio.SID) > 0 &&
len(cfg.Alerting.Twilio.From) > 0 &&
len(cfg.Alerting.Twilio.To) > 0 {
log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alertTriggered.Description)
alertProvider = &core.CustomAlertProvider{
Url: fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", cfg.Alerting.Twilio.SID),
Method: "POST",
Body: url.Values{
"To": {cfg.Alerting.Twilio.To},
"From": {cfg.Alerting.Twilio.From},
"Body": {fmt.Sprintf("%s - %s", service.Name, alertTriggered.Description)},
}.Encode(),
Headers: map[string]string{
"Content-Type": "application/x-www-form-urlencoded",
"Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", cfg.Alerting.Twilio.SID, cfg.Alerting.Twilio.Token)))),
},
} }
} else if alert.Type == core.TwilioAlert {
if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been resolved", alert.Description)
alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
} else { } else {
log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because twilio config settings missing") log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly")
} }
} else if alertTriggered.Type == core.CustomAlert { } else if alert.Type == core.CustomAlert {
if cfg.Alerting.Custom != nil && len(cfg.Alerting.Custom.Url) > 0 { if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
log.Printf("[watchdog][monitor] Sending custom alert because alert with description=%s has been triggered", alertTriggered.Description) log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been resolved", alert.Description)
alertProvider = &core.CustomAlertProvider{ alertProvider = &core.CustomAlertProvider{
Url: cfg.Alerting.Custom.Url, Url: cfg.Alerting.Custom.Url,
Method: cfg.Alerting.Custom.Method, Method: cfg.Alerting.Custom.Method,
@ -105,19 +115,59 @@ func monitor(service *core.Service) {
Headers: cfg.Alerting.Custom.Headers, Headers: cfg.Alerting.Custom.Headers,
} }
} else { } else {
log.Printf("[watchdog][monitor] Not sending custom alert despite being triggered, because there is no custom url configured") log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being resolved, because the custom provider isn't configured properly")
} }
} }
if alertProvider != nil { if alertProvider != nil {
err := alertProvider.Send(service.Name, alertTriggered.Description) err := alertProvider.Send(service.Name, alert.Description, true)
if err != nil { if err != nil {
log.Printf("[watchdog][monitor] Ran into error sending an alert: %s", err.Error()) log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error())
} }
} }
} }
} }
service.NumberOfFailuresInARow = 0
log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s", service.Interval, service.Name) } else {
time.Sleep(service.Interval) service.NumberOfFailuresInARow++
for _, alert := range service.Alerts {
// If the alert hasn't been triggered, move to the next one
if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow {
continue
}
var alertProvider *core.CustomAlertProvider
if alert.Type == core.SlackAlert {
if len(cfg.Alerting.Slack) > 0 {
log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been triggered", alert.Description)
alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
} else {
log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
}
} else if alert.Type == core.TwilioAlert {
if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
} else {
log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being triggered, because Twilio config settings missing")
}
} else if alert.Type == core.CustomAlert {
if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been triggered", alert.Description)
alertProvider = &core.CustomAlertProvider{
Url: cfg.Alerting.Custom.Url,
Method: cfg.Alerting.Custom.Method,
Body: cfg.Alerting.Custom.Body,
Headers: cfg.Alerting.Custom.Headers,
}
} else {
log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being triggered, because there is no custom url configured")
}
}
if alertProvider != nil {
err := alertProvider.Send(service.Name, alert.Description, false)
if err != nil {
log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error())
}
}
}
} }
} }