Add support for PagerDuty
parent cf6a74f862
commit 75b7a41c9d

README.md
@@ -17,14 +17,16 @@ core applications: https://status.twinnation.org/
 - [Usage](#usage)
 - [Configuration](#configuration)
   - [Conditions](#conditions)
+  - [Alerting](#alerting)
+    - [Configuring Slack alerts](#configuring-slack-alerts)
+    - [Configuring PagerDuty alerts](#configuring-pagerduty-alerts)
+    - [Configuring Twilio alerts](#configuring-twilio-alerts)
+    - [Configuring custom alerts](#configuring-custom-alerts)
 - [Docker](#docker)
 - [Running the tests](#running-the-tests)
 - [Using in Production](#using-in-production)
 - [FAQ](#faq)
   - [Sending a GraphQL request](#sending-a-graphql-request)
-  - [Configuring Slack alerts](#configuring-slack-alerts)
-  - [Configuring Twilio alerts](#configuring-twilio-alerts)
-  - [Configuring custom alerts](#configuring-custom-alerts)
 
 ## Features
@@ -72,35 +74,36 @@ Note that you can also add environment variables in the configuration file (i.e.
 
 ### Configuration
 
 | Parameter | Description | Default |
 | --------- | ----------- | ------- |
 | `debug` | Whether to enable debug logs | `false` |
 | `metrics` | Whether to expose metrics at /metrics | `false` |
 | `services` | List of services to monitor | Required `[]` |
 | `services[].name` | Name of the service. Can be anything. | Required `""` |
 | `services[].url` | URL to send the request to | Required `""` |
 | `services[].conditions` | Conditions used to determine the health of the service | `[]` |
 | `services[].interval` | Duration to wait between every status check | `60s` |
 | `services[].method` | Request method | `GET` |
 | `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
 | `services[].body` | Request body | `""` |
 | `services[].headers` | Request headers | `{}` |
 | `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` |
 | `services[].alerts[].enabled` | Whether to enable the alert | `false` |
 | `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
 | `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
 | `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` |
+| `services[].alerts[].success-before-resolved` | Number of successes in a row needed before sending a resolved notification | `2` |
 | `alerting` | Configuration for alerting | `{}` |
 | `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
 | `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
 | `alerting.twilio.sid` | Twilio account SID | Required `""` |
 | `alerting.twilio.token` | Twilio auth token | Required `""` |
 | `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` |
 | `alerting.twilio.to` | Number to send Twilio alerts to | Required `""` |
 | `alerting.custom` | Configuration for custom actions on failure or alerts | `""` |
 | `alerting.custom.url` | Custom alerting request url | `""` |
 | `alerting.custom.body` | Custom alerting request body | `""` |
 | `alerting.custom.headers` | Custom alerting request headers | `{}` |
 
 
 ### Conditions
@@ -121,6 +124,136 @@ Here are some examples of conditions you can use:
 | `len([BODY].name) == 8` | String at jsonpath `$.name` has a length of 8 | `{"name":"john.doe"}` | `{"name":"bob"}` |
 
 
+### Alerting
+
+
+#### Configuring Slack alerts
+
+```yaml
+alerting:
+  slack: "https://hooks.slack.com/services/**********/**********/**********"
+services:
+  - name: twinnation
+    interval: 30s
+    url: "https://twinnation.org/health"
+    alerts:
+      - type: slack
+        enabled: true
+        description: "healthcheck failed 3 times in a row"
+        send-on-resolved: true
+      - type: slack
+        enabled: true
+        threshold: 5
+        description: "healthcheck failed 5 times in a row"
+        send-on-resolved: true
+    conditions:
+      - "[STATUS] == 200"
+      - "[BODY].status == UP"
+      - "[RESPONSE_TIME] < 300"
+```
+
+Here's an example of what the notifications look like:
+
+
+
+
+#### Configuring PagerDuty alerts
+
+It is highly recommended to set `services[].alerts[].send-on-resolved` to `true` for alerts
+of type `pagerduty`: unlike other alert types, the resolved notification does not create another
+incident, but instead marks the existing incident as resolved on PagerDuty.
+
+```yaml
+alerting:
+  pagerduty: "********************************"
+services:
+  - name: twinnation
+    interval: 30s
+    url: "https://twinnation.org/health"
+    alerts:
+      - type: pagerduty
+        enabled: true
+        threshold: 3
+        description: "healthcheck failed 3 times in a row"
+        send-on-resolved: true
+        success-before-resolved: 5
+    conditions:
+      - "[STATUS] == 200"
+      - "[BODY].status == UP"
+      - "[RESPONSE_TIME] < 300"
+```
+
+
+#### Configuring Twilio alerts
+
+```yaml
+alerting:
+  twilio:
+    sid: "..."
+    token: "..."
+    from: "+1-234-567-8901"
+    to: "+1-234-567-8901"
+services:
+  - name: twinnation
+    interval: 30s
+    url: "https://twinnation.org/health"
+    alerts:
+      - type: twilio
+        enabled: true
+        threshold: 5
+        description: "healthcheck failed 5 times in a row"
+    conditions:
+      - "[STATUS] == 200"
+      - "[BODY].status == UP"
+      - "[RESPONSE_TIME] < 300"
+```
+
+
+#### Configuring custom alerts
+
+While they're called alerts, you can use this feature to call anything.
+
+For instance, you could automate rollbacks by having an application that keeps track of new deployments, and by
+leveraging Gatus, you could have Gatus call that application endpoint when a service starts failing. Your application
+would then check if the service that started failing was recently deployed, and if it was, then automatically
+roll it back.
+
+The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the
+service name respectively in the body (`alerting.custom.body`) as well as the url (`alerting.custom.url`).
+
+If you have `send-on-resolved` set to `true`, you may want to use `[ALERT_TRIGGERED_OR_RESOLVED]` to differentiate
+the notifications. It will be replaced with either `TRIGGERED` or `RESOLVED`, based on the situation.
+
+For all intents and purposes, we'll configure the custom alert with a Slack webhook, but you can call anything you want.
+
+```yaml
+alerting:
+  custom:
+    url: "https://hooks.slack.com/services/**********/**********/**********"
+    method: "POST"
+    body: |
+      {
+        "text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"
+      }
+services:
+  - name: twinnation
+    interval: 30s
+    url: "https://twinnation.org/health"
+    alerts:
+      - type: custom
+        enabled: true
+        threshold: 10
+        send-on-resolved: true
+        description: "healthcheck failed 10 times in a row"
+    conditions:
+      - "[STATUS] == 200"
+      - "[BODY].status == UP"
+      - "[RESPONSE_TIME] < 300"
+```
+
+
 ## Docker
 
 ```
@@ -173,101 +306,3 @@ will send a `POST` request to `http://localhost:8080/playground` with the follow
 ```json
 {"query":" {\n user(gender: \"female\") {\n id\n name\n gender\n avatar\n }\n }"}
 ```
-
-
-### Configuring Slack alerts
-
-```yaml
-alerting:
-  slack: "https://hooks.slack.com/services/**********/**********/**********"
-services:
-  - name: twinnation
-    interval: 30s
-    url: "https://twinnation.org/health"
-    alerts:
-      - type: slack
-        enabled: true
-        description: "healthcheck failed 3 times in a row"
-        send-on-resolved: true
-      - type: slack
-        enabled: true
-        threshold: 5
-        description: "healthcheck failed 5 times in a row"
-        send-on-resolved: true
-    conditions:
-      - "[STATUS] == 200"
-      - "[BODY].status == UP"
-      - "[RESPONSE_TIME] < 300"
-```
-
-Here's an example of what the notifications look like:
-
-
-
-
-### Configuring Twilio alerts
-
-```yaml
-alerting:
-  twilio:
-    sid: "..."
-    token: "..."
-    from: "+1-234-567-8901"
-    to: "+1-234-567-8901"
-services:
-  - name: twinnation
-    interval: 30s
-    url: "https://twinnation.org/health"
-    alerts:
-      - type: twilio
-        enabled: true
-        threshold: 5
-        description: "healthcheck failed 5 times in a row"
-    conditions:
-      - "[STATUS] == 200"
-      - "[BODY].status == UP"
-      - "[RESPONSE_TIME] < 300"
-```
-
-
-### Configuring custom alerts
-
-While they're called alerts, you can use this feature to call anything.
-
-For instance, you could automate rollbacks by having an application that keeps tracks of new deployments, and by
-leveraging Gatus, you could have Gatus call that application endpoint when a service starts failing. Your application
-would then check if the service that started failing was recently deployed, and if it was, then automatically
-roll it back.
-
-The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the
-service name respectively in the body (`alerting.custom.body`) as well as the url (`alerting.custom.url`).
-
-If you have `send-on-resolved` set to `true`, you may want to use `[ALERT_TRIGGERED_OR_RESOLVED]` to differentiate
-the notifications. It will be replaced for either `TRIGGERED` or `RESOLVED`, based on the situation.
-
-For all intents and purpose, we'll configure the custom alert with a Slack webhook, but you can call anything you want.
-
-```yaml
-alerting:
-  custom:
-    url: "https://hooks.slack.com/services/**********/**********/**********"
-    method: "POST"
-    body: |
-      {
-        "text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"
-      }
-services:
-  - name: twinnation
-    interval: 30s
-    url: "https://twinnation.org/health"
-    alerts:
-      - type: custom
-        enabled: true
-        threshold: 10
-        send-on-resolved: true
-        description: "healthcheck failed 10 times in a row"
-    conditions:
-      - "[STATUS] == 200"
-      - "[BODY].status == UP"
-      - "[RESPONSE_TIME] < 300"
-```

alerting/alerting.go (new file)
@@ -0,0 +1,158 @@
package alerting

import (
	"encoding/json"
	"fmt"
	"github.com/TwinProduction/gatus/config"
	"github.com/TwinProduction/gatus/core"
	"log"
)

// Handle takes care of alerts to resolve and alerts to trigger based on result success or failure
func Handle(service *core.Service, result *core.Result) {
	cfg := config.Get()
	if cfg.Alerting == nil {
		return
	}
	if result.Success {
		handleAlertsToResolve(service, result, cfg)
	} else {
		handleAlertsToTrigger(service, result, cfg)
	}
}

func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *config.Config) {
	service.NumberOfSuccessesInARow = 0
	service.NumberOfFailuresInARow++
	for _, alert := range service.Alerts {
		// If the alert hasn't been triggered, move to the next one
		if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow {
			continue
		}
		if alert.Triggered {
			if cfg.Debug {
				log.Printf("[alerting][handleAlertsToTrigger] Alert with description='%s' has already been triggered, skipping", alert.Description)
			}
			continue
		}
		var alertProvider *core.CustomAlertProvider
		if alert.Type == core.SlackAlert {
			if len(cfg.Alerting.Slack) > 0 {
				log.Printf("[alerting][handleAlertsToTrigger] Sending Slack alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
			}
		} else if alert.Type == core.PagerDutyAlert {
			if len(cfg.Alerting.PagerDuty) > 0 {
				log.Printf("[alerting][handleAlertsToTrigger] Sending PagerDuty alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = core.CreatePagerDutyCustomAlertProvider(cfg.Alerting.PagerDuty, "trigger", "", service, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending PagerDuty alert despite being triggered, because PagerDuty isn't configured properly")
			}
		} else if alert.Type == core.TwilioAlert {
			if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
				log.Printf("[alerting][handleAlertsToTrigger] Sending Twilio alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending Twilio alert despite being triggered, because Twilio config settings are missing")
			}
		} else if alert.Type == core.CustomAlert {
			if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
				log.Printf("[alerting][handleAlertsToTrigger] Sending custom alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = &core.CustomAlertProvider{
					Url:     cfg.Alerting.Custom.Url,
					Method:  cfg.Alerting.Custom.Method,
					Body:    cfg.Alerting.Custom.Body,
					Headers: cfg.Alerting.Custom.Headers,
				}
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending custom alert despite being triggered, because there is no custom url configured")
			}
		}
		if alertProvider != nil {
			// TODO: retry on error
			var err error
			if alert.Type == core.PagerDutyAlert {
				var body []byte
				body, err = alertProvider.Send(service.Name, alert.Description, false)
				if err == nil {
					var response pagerDutyResponse
					err = json.Unmarshal(body, &response)
					if err != nil {
						log.Printf("[alerting][handleAlertsToTrigger] Ran into error unmarshaling PagerDuty response: %s", err.Error())
					} else {
						alert.ResolveKey = response.DedupKey
					}
				}
			} else {
				_, err = alertProvider.Send(service.Name, alert.Description, false)
			}
			if err != nil {
				log.Printf("[alerting][handleAlertsToTrigger] Ran into error sending an alert: %s", err.Error())
			} else {
				alert.Triggered = true
			}
		}
	}
}

func handleAlertsToResolve(service *core.Service, result *core.Result, cfg *config.Config) {
	service.NumberOfSuccessesInARow++
	for _, alert := range service.Alerts {
		if !alert.Enabled || !alert.Triggered || alert.SuccessBeforeResolved > service.NumberOfSuccessesInARow {
			continue
		}
		alert.Triggered = false
		if !alert.SendOnResolved {
			continue
		}
		var alertProvider *core.CustomAlertProvider
		if alert.Type == core.SlackAlert {
			if len(cfg.Alerting.Slack) > 0 {
				log.Printf("[alerting][handleAlertsToResolve] Sending Slack alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending Slack alert despite being resolved, because there is no Slack webhook configured")
			}
		} else if alert.Type == core.PagerDutyAlert {
			if len(cfg.Alerting.PagerDuty) > 0 {
				log.Printf("[alerting][handleAlertsToResolve] Sending PagerDuty alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = core.CreatePagerDutyCustomAlertProvider(cfg.Alerting.PagerDuty, "resolve", alert.ResolveKey, service, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending PagerDuty alert despite being resolved, because PagerDuty isn't configured properly")
			}
		} else if alert.Type == core.TwilioAlert {
			if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
				log.Printf("[alerting][handleAlertsToResolve] Sending Twilio alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly")
			}
		} else if alert.Type == core.CustomAlert {
			if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
				log.Printf("[alerting][handleAlertsToResolve] Sending custom alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = &core.CustomAlertProvider{
					Url:     cfg.Alerting.Custom.Url,
					Method:  cfg.Alerting.Custom.Method,
					Body:    cfg.Alerting.Custom.Body,
					Headers: cfg.Alerting.Custom.Headers,
				}
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending custom alert despite being resolved, because the custom provider isn't configured properly")
			}
		}
		if alertProvider != nil {
			// TODO: retry on error
			_, err := alertProvider.Send(service.Name, alert.Description, true)
			if err != nil {
				log.Printf("[alerting][handleAlertsToResolve] Ran into error sending an alert: %s", err.Error())
			} else {
				if alert.Type == core.PagerDutyAlert {
					alert.ResolveKey = ""
				}
			}
		}
	}
	service.NumberOfFailuresInARow = 0
}
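
To show how this new package is meant to be consumed, here is a minimal sketch assuming only what the watchdog hunk further down in this diff shows (`alerting.Handle(service, result)`); the wrapper function name below is hypothetical.

```go
package watchdog

import (
	"github.com/TwinProduction/gatus/alerting"
	"github.com/TwinProduction/gatus/core"
)

// Illustrative only: once a service has been evaluated, the monitoring loop hands
// the result to the alerting package, which triggers or resolves alerts based on
// result.Success and the per-alert thresholds.
func afterEvaluation(service *core.Service, result *core.Result) {
	alerting.Handle(service, result)
}
```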

alerting/pagerduty.go (new file)
@@ -0,0 +1,7 @@
package alerting

type pagerDutyResponse struct {
	Status   string `json:"status"`
	Message  string `json:"message"`
	DedupKey string `json:"dedup_key"`
}
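
For context, a small illustrative sketch of how a PagerDuty Events API v2 response body would be decoded into this struct, mirroring what `handleAlertsToTrigger` does above; the sample JSON values and the helper name are made up for illustration.

```go
package alerting

import (
	"encoding/json"
	"fmt"
)

// Illustrative only: decode a hypothetical Events API v2 response the same way
// handleAlertsToTrigger does, keeping dedup_key so the incident can be resolved later.
func exampleDecodePagerDutyResponse() {
	body := []byte(`{"status":"success","message":"Event processed","dedup_key":"abc123"}`)
	var response pagerDutyResponse
	if err := json.Unmarshal(body, &response); err != nil {
		fmt.Println("unexpected response:", err)
		return
	}
	fmt.Println(response.DedupKey) // this is what gets stored in alert.ResolveKey
}
```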

@@ -16,12 +16,24 @@ type Alert struct {
 	// SendOnResolved defines whether to send a second notification when the issue has been resolved
 	SendOnResolved bool `yaml:"send-on-resolved"`
+
+	// SuccessBeforeResolved defines the number of successes in a row needed before sending a resolved notification
+	SuccessBeforeResolved int `yaml:"success-before-resolved"`
+
+	// ResolveKey is an optional field that is used by some providers (e.g. PagerDuty's dedup_key) to resolve
+	// ongoing/triggered incidents
+	ResolveKey string
+
+	// Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
+	// should be set back to false. It is used to prevent the same alert from going out twice.
+	Triggered bool
 }
 
 type AlertType string
 
 const (
 	SlackAlert     AlertType = "slack"
+	PagerDutyAlert AlertType = "pagerduty"
 	TwilioAlert    AlertType = "twilio"
 	CustomAlert    AlertType = "custom"
 )

@@ -5,15 +5,17 @@ import (
 	"encoding/base64"
 	"fmt"
 	"github.com/TwinProduction/gatus/client"
+	"io/ioutil"
 	"net/http"
 	"net/url"
 	"strings"
 )
 
 type AlertingConfig struct {
 	Slack     string               `yaml:"slack"`
+	PagerDuty string               `yaml:"pagerduty"`
 	Twilio    *TwilioAlertProvider `yaml:"twilio"`
 	Custom    *CustomAlertProvider `yaml:"custom"`
 }
 
 type TwilioAlertProvider struct {
@@ -75,26 +77,32 @@ func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription
 	return request
 }
 
-func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) error {
+// Send a request to the alert provider and return the body
+func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) ([]byte, error) {
 	request := provider.buildRequest(serviceName, alertDescription, resolved)
 	response, err := client.GetHttpClient().Do(request)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	if response.StatusCode > 399 {
-		return fmt.Errorf("call to provider alert returned status code %d", response.StatusCode)
+		body, err := ioutil.ReadAll(response.Body)
+		if err != nil {
+			return nil, fmt.Errorf("call to provider alert returned status code %d", response.StatusCode)
+		} else {
+			return nil, fmt.Errorf("call to provider alert returned status code %d: %s", response.StatusCode, string(body))
+		}
 	}
-	return nil
+	return ioutil.ReadAll(response.Body)
 }
 
 func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider {
 	var message string
 	var color string
 	if resolved {
-		message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow)
+		message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessBeforeResolved)
 		color = "#36A64F"
 	} else {
-		message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name)
+		message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.Threshold)
 		color = "#DD0000"
 	}
 	var results string
@@ -147,3 +155,24 @@ func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message stri
 		},
 	}
 }
+
+// https://developer.pagerduty.com/docs/events-api-v2/trigger-events/
+func CreatePagerDutyCustomAlertProvider(routingKey, eventAction, resolveKey string, service *Service, message string) *CustomAlertProvider {
+	return &CustomAlertProvider{
+		Url:    "https://events.pagerduty.com/v2/enqueue",
+		Method: "POST",
+		Body: fmt.Sprintf(`{
+			"routing_key": "%s",
+			"dedup_key": "%s",
+			"event_action": "%s",
+			"payload": {
+				"summary": "%s",
+				"source": "%s",
+				"severity": "critical"
+			}
+		}`, routingKey, resolveKey, eventAction, message, service.Name),
+		Headers: map[string]string{
+			"Content-Type": "application/json",
+		},
+	}
+}
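
A brief usage sketch of the helper above; the routing key, summary, and function name are placeholders rather than values from this commit. On the resolve path, `eventAction` would be `"resolve"` and `resolveKey` the previously stored `dedup_key`.

```go
package core

import "fmt"

// Illustrative only: build a trigger event for a hypothetical service and inspect
// the request the provider would send to the PagerDuty Events API.
func examplePagerDutyTriggerPayload() {
	service := &Service{Name: "twinnation"}
	provider := CreatePagerDutyCustomAlertProvider(
		"00000000000000000000000000000000", // routing key (placeholder)
		"trigger",                          // event_action
		"",                                 // no dedup_key yet on the first trigger
		service,
		fmt.Sprintf("TRIGGERED: %s - %s", service.Name, "healthcheck failed 3 times in a row"),
	)
	fmt.Println(provider.Url)  // https://events.pagerduty.com/v2/enqueue
	fmt.Println(provider.Body) // JSON payload sent to the Events API
}
```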

@@ -46,7 +46,8 @@ type Service struct {
 	// Alerts is the alerting configuration for the service in case of failure
 	Alerts []*Alert `yaml:"alerts"`
 
 	NumberOfFailuresInARow  int
+	NumberOfSuccessesInARow int
 }
 
 func (service *Service) Validate() {
@@ -64,6 +65,9 @@ func (service *Service) Validate() {
 		if alert.Threshold <= 0 {
 			alert.Threshold = 3
 		}
+		if alert.SuccessBeforeResolved <= 0 {
+			alert.SuccessBeforeResolved = 2
+		}
 	}
 	if len(service.Url) == 0 {
 		panic(ErrNoUrl)
|
|||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/TwinProduction/gatus/alerting"
|
||||||
"github.com/TwinProduction/gatus/config"
|
"github.com/TwinProduction/gatus/config"
|
||||||
"github.com/TwinProduction/gatus/core"
|
"github.com/TwinProduction/gatus/core"
|
||||||
"github.com/TwinProduction/gatus/metric"
|
"github.com/TwinProduction/gatus/metric"
|
||||||
@ -70,7 +71,7 @@ func monitor(service *core.Service) {
|
|||||||
result.Duration.Round(time.Millisecond),
|
result.Duration.Round(time.Millisecond),
|
||||||
extra,
|
extra,
|
||||||
)
|
)
|
||||||
handleAlerting(service, result)
|
alerting.Handle(service, result)
|
||||||
if cfg.Debug {
|
if cfg.Debug {
|
||||||
log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name)
|
log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name)
|
||||||
}
|
}
|
||||||
@ -78,96 +79,3 @@ func monitor(service *core.Service) {
|
|||||||
time.Sleep(service.Interval)
|
time.Sleep(service.Interval)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func handleAlerting(service *core.Service, result *core.Result) {
|
|
||||||
cfg := config.Get()
|
|
||||||
if cfg.Alerting == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if result.Success {
|
|
||||||
if service.NumberOfFailuresInARow > 0 {
|
|
||||||
for _, alert := range service.Alerts {
|
|
||||||
if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
var alertProvider *core.CustomAlertProvider
|
|
||||||
if alert.Type == core.SlackAlert {
|
|
||||||
if len(cfg.Alerting.Slack) > 0 {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been resolved", alert.Description)
|
|
||||||
alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
|
|
||||||
} else {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
|
|
||||||
}
|
|
||||||
} else if alert.Type == core.TwilioAlert {
|
|
||||||
if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been resolved", alert.Description)
|
|
||||||
alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
|
|
||||||
} else {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly")
|
|
||||||
}
|
|
||||||
} else if alert.Type == core.CustomAlert {
|
|
||||||
if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been resolved", alert.Description)
|
|
||||||
alertProvider = &core.CustomAlertProvider{
|
|
||||||
Url: cfg.Alerting.Custom.Url,
|
|
||||||
Method: cfg.Alerting.Custom.Method,
|
|
||||||
Body: cfg.Alerting.Custom.Body,
|
|
||||||
Headers: cfg.Alerting.Custom.Headers,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being resolved, because the custom provider isn't configured properly")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if alertProvider != nil {
|
|
||||||
err := alertProvider.Send(service.Name, alert.Description, true)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
service.NumberOfFailuresInARow = 0
|
|
||||||
} else {
|
|
||||||
service.NumberOfFailuresInARow++
|
|
||||||
for _, alert := range service.Alerts {
|
|
||||||
// If the alert hasn't been triggered, move to the next one
|
|
||||||
if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
var alertProvider *core.CustomAlertProvider
|
|
||||||
if alert.Type == core.SlackAlert {
|
|
||||||
if len(cfg.Alerting.Slack) > 0 {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been triggered", alert.Description)
|
|
||||||
alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
|
|
||||||
} else {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
|
|
||||||
}
|
|
||||||
} else if alert.Type == core.TwilioAlert {
|
|
||||||
if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
|
|
||||||
alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
|
|
||||||
} else {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being triggered, because Twilio config settings missing")
|
|
||||||
}
|
|
||||||
} else if alert.Type == core.CustomAlert {
|
|
||||||
if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been triggered", alert.Description)
|
|
||||||
alertProvider = &core.CustomAlertProvider{
|
|
||||||
Url: cfg.Alerting.Custom.Url,
|
|
||||||
Method: cfg.Alerting.Custom.Method,
|
|
||||||
Body: cfg.Alerting.Custom.Body,
|
|
||||||
Headers: cfg.Alerting.Custom.Headers,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being triggered, because there is no custom url configured")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if alertProvider != nil {
|
|
||||||
err := alertProvider.Send(service.Name, alert.Description, false)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user