Merge pull request #10 from TwinProduction/notify-on-resolved
Support sending an alert when an unhealthy service becomes healthy again

.github/assets/slack-alerts.png (BIN, new file, vendored, 34 KiB): binary file not shown.

README.md (36 changed lines)
							| @ -24,7 +24,7 @@ core applications: https://status.twinnation.org/ | |||||||
|   - [Sending a GraphQL request](#sending-a-graphql-request) |   - [Sending a GraphQL request](#sending-a-graphql-request) | ||||||
|   - [Configuring Slack alerts](#configuring-slack-alerts) |   - [Configuring Slack alerts](#configuring-slack-alerts) | ||||||
|   - [Configuring Twilio alerts](#configuring-twilio-alerts) |   - [Configuring Twilio alerts](#configuring-twilio-alerts) | ||||||
|   - [Configuring custom alert](#configuring-custom-alerts) |   - [Configuring custom alerts](#configuring-custom-alerts) | ||||||
|  |  | ||||||
|  |  | ||||||
| ## Features | ## Features | ||||||
| @ -67,13 +67,14 @@ This example would look like this: | |||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| Note that you can also add environment variables in the your configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) | Note that you can also add environment variables in the configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) | ||||||
|  |  | ||||||
|  |  | ||||||
| ### Configuration | ### Configuration | ||||||
|  |  | ||||||
| | Parameter                              | Description                                                     | Default        | | | Parameter                              | Description                                                     | Default        | | ||||||
| | --------------------------------- | --------------------------------------------------------------- | -------------- | | | -------------------------------------- | --------------------------------------------------------------- | -------------- | | ||||||
|  | | `debug`                                | Whether to enable debug logs                                    | `false`        | | ||||||
| | `metrics`                              | Whether to expose metrics at /metrics                           | `false`        | | | `metrics`                              | Whether to expose metrics at /metrics                           | `false`        | | ||||||
| | `services`                             | List of services to monitor                                     | Required `[]`  | | | `services`                             | List of services to monitor                                     | Required `[]`  | | ||||||
| | `services[].name`                      | Name of the service. Can be anything.                           | Required `""`  | | | `services[].name`                      | Name of the service. Can be anything.                           | Required `""`  | | ||||||
| @ -88,6 +89,7 @@ Note that you can also add environment variables in the your configuration file | |||||||
| | `services[].alerts[].enabled`          | Whether to enable the alert                                     | `false`        | | | `services[].alerts[].enabled`          | Whether to enable the alert                                     | `false`        | | ||||||
| | `services[].alerts[].threshold`        | Number of failures in a row needed before triggering the alert  | `3`            | | | `services[].alerts[].threshold`        | Number of failures in a row needed before triggering the alert  | `3`            | | ||||||
| | `services[].alerts[].description`      | Description of the alert. Will be included in the alert sent    | `""`           | | | `services[].alerts[].description`      | Description of the alert. Will be included in the alert sent    | `""`           | | ||||||
|  | | `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides  | `false`        | | ||||||
| | `alerting`                             | Configuration for alerting                                      | `{}`           | | | `alerting`                             | Configuration for alerting                                      | `{}`           | | ||||||
| | `alerting.slack`                       | Webhook to use for alerts of type `slack`                       | `""`           | | | `alerting.slack`                       | Webhook to use for alerts of type `slack`                       | `""`           | | ||||||
| | `alerting.twilio`                      | Settings for alerts of type `twilio`                            | `""`           | | | `alerting.twilio`                      | Settings for alerts of type `twilio`                            | `""`           | | ||||||
| @ -121,7 +123,7 @@ Here are some examples of conditions you can use: | |||||||
|  |  | ||||||
| ## Docker | ## Docker | ||||||
|  |  | ||||||
| Building the Docker image is done as following: | Building the Docker image is done as follows: | ||||||
|  |  | ||||||
| ``` | ``` | ||||||
| docker build . -t gatus | docker build . -t gatus | ||||||
| @ -194,33 +196,37 @@ services: | |||||||
|       - type: slack |       - type: slack | ||||||
|         enabled: true |         enabled: true | ||||||
|         description: "healthcheck failed 3 times in a row" |         description: "healthcheck failed 3 times in a row" | ||||||
|  |         send-on-resolved: true | ||||||
|       - type: slack |       - type: slack | ||||||
|         enabled: true |         enabled: true | ||||||
|         threshold: 5 |         threshold: 5 | ||||||
|         description: "healthcheck failed 5 times in a row" |         description: "healthcheck failed 5 times in a row" | ||||||
|  |         send-on-resolved: true | ||||||
|     conditions: |     conditions: | ||||||
|       - "[STATUS] == 200" |       - "[STATUS] == 200" | ||||||
|       - "[BODY].status == UP" |       - "[BODY].status == UP" | ||||||
|       - "[RESPONSE_TIME] < 300" |       - "[RESPONSE_TIME] < 300" | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
|  | Here's an example of what the notifications look like: | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ### Configuring Twilio alerts | ### Configuring Twilio alerts | ||||||
|  |  | ||||||
| ```yaml | ```yaml | ||||||
| alerting: | alerting: | ||||||
|   twilio: |   twilio: | ||||||
|     sid: **** |     sid: "..." | ||||||
|     token: **** |     token: "..." | ||||||
|     from: +1-234-567-8901 |     from: "+1-234-567-8901" | ||||||
|     to: +1-234-567-8901 |     to: "+1-234-567-8901" | ||||||
| services: | services: | ||||||
|   - name: twinnation |   - name: twinnation | ||||||
|     interval: 30s |     interval: 30s | ||||||
|     url: "https://twinnation.org/health" |     url: "https://twinnation.org/health" | ||||||
|     alerts: |     alerts: | ||||||
|       - type: twilio |  | ||||||
|         enabled: true |  | ||||||
|         description: "healthcheck failed 3 times in a row" |  | ||||||
|       - type: twilio |       - type: twilio | ||||||
|         enabled: true |         enabled: true | ||||||
|         threshold: 5 |         threshold: 5 | ||||||
| @ -242,7 +248,10 @@ would then check if the service that started failing was recently deployed, and | |||||||
| roll it back. | roll it back. | ||||||
|  |  | ||||||
| The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the  | The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the  | ||||||
| service name respectively in the body (`alerting.custom.body`) and the url (`alerting.custom.url`). | service name respectively in the body (`alerting.custom.body`) as well as the url (`alerting.custom.url`). | ||||||
|  |  | ||||||
|  | If you have `send-on-resolved` set to `true`, you may want to use `[ALERT_TRIGGERED_OR_RESOLVED]` to differentiate | ||||||
|  | the notifications. It will be replaced for either `TRIGGERED` or `RESOLVED`, based on the situation. | ||||||
|  |  | ||||||
| For all intents and purpose, we'll configure the custom alert with a Slack webhook, but you can call anything you want. | For all intents and purpose, we'll configure the custom alert with a Slack webhook, but you can call anything you want. | ||||||
|  |  | ||||||
| @ -253,7 +262,7 @@ alerting: | |||||||
|     method: "POST" |     method: "POST" | ||||||
|     body: | |     body: | | ||||||
|       { |       { | ||||||
|         "text": "[SERVICE_NAME] - [ALERT_DESCRIPTION]" |         "text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]" | ||||||
|       } |       } | ||||||
| services: | services: | ||||||
|   - name: twinnation |   - name: twinnation | ||||||
| @ -263,6 +272,7 @@ services: | |||||||
|       - type: custom |       - type: custom | ||||||
|         enabled: true |         enabled: true | ||||||
|         threshold: 10 |         threshold: 10 | ||||||
|  |         send-on-resolved: true | ||||||
|         description: "healthcheck failed 10 times in a row" |         description: "healthcheck failed 10 times in a row" | ||||||
|     conditions: |     conditions: | ||||||
|       - "[STATUS] == 200" |       - "[STATUS] == 200" | ||||||
|  | |||||||
| @ -22,6 +22,7 @@ var ( | |||||||
|  |  | ||||||
| type Config struct { | type Config struct { | ||||||
| 	Metrics  bool                 `yaml:"metrics"` | 	Metrics  bool                 `yaml:"metrics"` | ||||||
|  | 	Debug    bool                 `yaml:"debug"` | ||||||
| 	Alerting *core.AlertingConfig `yaml:"alerting"` | 	Alerting *core.AlertingConfig `yaml:"alerting"` | ||||||
| 	Services []*core.Service      `yaml:"services"` | 	Services []*core.Service      `yaml:"services"` | ||||||
| } | } | ||||||
|  | |||||||
| @ -13,6 +13,9 @@ type Alert struct { | |||||||
|  |  | ||||||
| 	// Description of the alert. Will be included in the alert sent. | 	// Description of the alert. Will be included in the alert sent. | ||||||
| 	Description string `yaml:"description"` | 	Description string `yaml:"description"` | ||||||
|  |  | ||||||
|  | 	// SendOnResolved defines whether to send a second notification when the issue has been resolved | ||||||
|  | 	SendOnResolved bool `yaml:"send-on-resolved"` | ||||||
| } | } | ||||||
|  |  | ||||||
| type AlertType string | type AlertType string | ||||||
|  | |||||||
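For reference, the two new YAML keys in this change map onto these struct fields through their yaml tags: `debug` onto `Config.Debug` and `send-on-resolved` onto `Alert.SendOnResolved`. The sketch below illustrates that mapping with trimmed-down stand-ins for the real structs (the actual types live in the config and core packages), and it assumes gopkg.in/yaml.v2, which the struct tags suggest.

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Trimmed-down stand-ins for config.Config, core.Service and core.Alert,
// reduced to the fields relevant to this change (illustrative only).
type trimmedAlert struct {
	Description    string `yaml:"description"`
	SendOnResolved bool   `yaml:"send-on-resolved"`
}

type trimmedService struct {
	Name   string          `yaml:"name"`
	Alerts []*trimmedAlert `yaml:"alerts"`
}

type trimmedConfig struct {
	Debug    bool              `yaml:"debug"`
	Services []*trimmedService `yaml:"services"`
}

func main() {
	raw := []byte(`
debug: true
services:
  - name: twinnation
    alerts:
      - description: "healthcheck failed 3 times in a row"
        send-on-resolved: true
`)
	var cfg trimmedConfig
	if err := yaml.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.Debug)                                // true
	fmt.Println(cfg.Services[0].Alerts[0].SendOnResolved) // true
}
```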
							
								
								
									
core/alerting.go (111 changed lines)
							| @ -2,9 +2,11 @@ package core | |||||||
|  |  | ||||||
| import ( | import ( | ||||||
| 	"bytes" | 	"bytes" | ||||||
|  | 	"encoding/base64" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"github.com/TwinProduction/gatus/client" | 	"github.com/TwinProduction/gatus/client" | ||||||
| 	"net/http" | 	"net/http" | ||||||
|  | 	"net/url" | ||||||
| 	"strings" | 	"strings" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| @ -21,6 +23,10 @@ type TwilioAlertProvider struct { | |||||||
| 	To    string `yaml:"to"` | 	To    string `yaml:"to"` | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func (provider *TwilioAlertProvider) IsValid() bool { | ||||||
|  | 	return len(provider.Token) > 0 && len(provider.SID) > 0 && len(provider.From) > 0 && len(provider.To) > 0 | ||||||
|  | } | ||||||
|  |  | ||||||
| type CustomAlertProvider struct { | type CustomAlertProvider struct { | ||||||
| 	Url     string            `yaml:"url"` | 	Url     string            `yaml:"url"` | ||||||
| 	Method  string            `yaml:"method,omitempty"` | 	Method  string            `yaml:"method,omitempty"` | ||||||
| @ -28,31 +34,49 @@ type CustomAlertProvider struct { | |||||||
| 	Headers map[string]string `yaml:"headers,omitempty"` | 	Headers map[string]string `yaml:"headers,omitempty"` | ||||||
| } | } | ||||||
|  |  | ||||||
| func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription string) *http.Request { | func (provider *CustomAlertProvider) IsValid() bool { | ||||||
|  | 	return len(provider.Url) > 0 | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription string, resolved bool) *http.Request { | ||||||
| 	body := provider.Body | 	body := provider.Body | ||||||
| 	url := provider.Url | 	providerUrl := provider.Url | ||||||
| 	if strings.Contains(provider.Body, "[ALERT_DESCRIPTION]") { | 	if strings.Contains(body, "[ALERT_DESCRIPTION]") { | ||||||
| 		body = strings.ReplaceAll(provider.Body, "[ALERT_DESCRIPTION]", alertDescription) | 		body = strings.ReplaceAll(body, "[ALERT_DESCRIPTION]", alertDescription) | ||||||
| 	} | 	} | ||||||
| 	if strings.Contains(provider.Body, "[SERVICE_NAME]") { | 	if strings.Contains(body, "[SERVICE_NAME]") { | ||||||
| 		body = strings.ReplaceAll(provider.Body, "[SERVICE_NAME]", serviceName) | 		body = strings.ReplaceAll(body, "[SERVICE_NAME]", serviceName) | ||||||
| 	} | 	} | ||||||
| 	if strings.Contains(provider.Url, "[ALERT_DESCRIPTION]") { | 	if strings.Contains(body, "[ALERT_TRIGGERED_OR_RESOLVED]") { | ||||||
| 		url = strings.ReplaceAll(provider.Url, "[ALERT_DESCRIPTION]", alertDescription) | 		if resolved { | ||||||
|  | 			body = strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", "RESOLVED") | ||||||
|  | 		} else { | ||||||
|  | 			body = strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", "TRIGGERED") | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	if strings.Contains(providerUrl, "[ALERT_DESCRIPTION]") { | ||||||
|  | 		providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_DESCRIPTION]", alertDescription) | ||||||
|  | 	} | ||||||
|  | 	if strings.Contains(providerUrl, "[SERVICE_NAME]") { | ||||||
|  | 		providerUrl = strings.ReplaceAll(providerUrl, "[SERVICE_NAME]", serviceName) | ||||||
|  | 	} | ||||||
|  | 	if strings.Contains(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]") { | ||||||
|  | 		if resolved { | ||||||
|  | 			providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]", "RESOLVED") | ||||||
|  | 		} else { | ||||||
|  | 			providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]", "TRIGGERED") | ||||||
| 		} | 		} | ||||||
| 	if strings.Contains(provider.Url, "[SERVICE_NAME]") { |  | ||||||
| 		url = strings.ReplaceAll(provider.Url, "[SERVICE_NAME]", serviceName) |  | ||||||
| 	} | 	} | ||||||
| 	bodyBuffer := bytes.NewBuffer([]byte(body)) | 	bodyBuffer := bytes.NewBuffer([]byte(body)) | ||||||
| 	request, _ := http.NewRequest(provider.Method, url, bodyBuffer) | 	request, _ := http.NewRequest(provider.Method, providerUrl, bodyBuffer) | ||||||
| 	for k, v := range provider.Headers { | 	for k, v := range provider.Headers { | ||||||
| 		request.Header.Set(k, v) | 		request.Header.Set(k, v) | ||||||
| 	} | 	} | ||||||
| 	return request | 	return request | ||||||
| } | } | ||||||
|  |  | ||||||
| func (provider *CustomAlertProvider) Send(serviceName, alertDescription string) error { | func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) error { | ||||||
| 	request := provider.buildRequest(serviceName, alertDescription) | 	request := provider.buildRequest(serviceName, alertDescription, resolved) | ||||||
| 	response, err := client.GetHttpClient().Do(request) | 	response, err := client.GetHttpClient().Do(request) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return err | 		return err | ||||||
| @ -62,3 +86,64 @@ func (provider *CustomAlertProvider) Send(serviceName, alertDescription string) | |||||||
| 	} | 	} | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
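The rewritten buildRequest substitutes three placeholders, `[ALERT_DESCRIPTION]`, `[SERVICE_NAME]` and the new `[ALERT_TRIGGERED_OR_RESOLVED]`, in both the request body and the URL. The standalone sketch below mirrors that substitution so the TRIGGERED/RESOLVED output is easy to see; the `substitute` helper and the sample values are illustrative only, and the Contains guards from the original are skipped here because they only avoid no-op replacements.

```go
package main

import (
	"fmt"
	"strings"
)

// substitute mirrors the placeholder handling in buildRequest (sketch only).
func substitute(template, serviceName, alertDescription string, resolved bool) string {
	state := "TRIGGERED"
	if resolved {
		state = "RESOLVED"
	}
	out := strings.ReplaceAll(template, "[ALERT_DESCRIPTION]", alertDescription)
	out = strings.ReplaceAll(out, "[SERVICE_NAME]", serviceName)
	out = strings.ReplaceAll(out, "[ALERT_TRIGGERED_OR_RESOLVED]", state)
	return out
}

func main() {
	body := `{"text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"}`
	fmt.Println(substitute(body, "twinnation", "healthcheck failed 10 times in a row", false))
	// {"text": "TRIGGERED: twinnation - healthcheck failed 10 times in a row"}
	fmt.Println(substitute(body, "twinnation", "healthcheck failed 10 times in a row", true))
	// {"text": "RESOLVED: twinnation - healthcheck failed 10 times in a row"}
}
```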
|  | func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider { | ||||||
|  | 	var message string | ||||||
|  | 	var color string | ||||||
|  | 	if resolved { | ||||||
|  | 		message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow) | ||||||
|  | 		color = "#36A64F" | ||||||
|  | 	} else { | ||||||
|  | 		message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name) | ||||||
|  | 		color = "#DD0000" | ||||||
|  | 	} | ||||||
|  | 	var results string | ||||||
|  | 	for _, conditionResult := range result.ConditionResults { | ||||||
|  | 		var prefix string | ||||||
|  | 		if conditionResult.Success { | ||||||
|  | 			prefix = ":heavy_check_mark:" | ||||||
|  | 		} else { | ||||||
|  | 			prefix = ":x:" | ||||||
|  | 		} | ||||||
|  | 		results += fmt.Sprintf("%s - `%s`\n", prefix, conditionResult.Condition) | ||||||
|  | 	} | ||||||
|  | 	return &CustomAlertProvider{ | ||||||
|  | 		Url:    slackWebHookUrl, | ||||||
|  | 		Method: "POST", | ||||||
|  | 		Body: fmt.Sprintf(`{ | ||||||
|  |   "text": "", | ||||||
|  |   "attachments": [ | ||||||
|  |     { | ||||||
|  |       "title": ":helmet_with_white_cross: Gatus", | ||||||
|  |       "text": "%s:\n> %s", | ||||||
|  |       "short": false, | ||||||
|  |       "color": "%s", | ||||||
|  |       "fields": [ | ||||||
|  |         { | ||||||
|  |           "title": "Condition results", | ||||||
|  |           "value": "%s", | ||||||
|  |           "short": false | ||||||
|  |         } | ||||||
|  |       ] | ||||||
|  |     }, | ||||||
|  |   ] | ||||||
|  | }`, message, alert.Description, color, results), | ||||||
|  | 		Headers: map[string]string{"Content-Type": "application/json"}, | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message string) *CustomAlertProvider { | ||||||
|  | 	return &CustomAlertProvider{ | ||||||
|  | 		Url:    fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", provider.SID), | ||||||
|  | 		Method: "POST", | ||||||
|  | 		Body: url.Values{ | ||||||
|  | 			"To":   {provider.To}, | ||||||
|  | 			"From": {provider.From}, | ||||||
|  | 			"Body": {message}, | ||||||
|  | 		}.Encode(), | ||||||
|  | 		Headers: map[string]string{ | ||||||
|  | 			"Content-Type":  "application/x-www-form-urlencoded", | ||||||
|  | 			"Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", provider.SID, provider.Token)))), | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | |||||||
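CreateTwilioCustomAlertProvider shows that the Twilio integration is just a special case of the custom provider: a form-encoded POST to the account's Messages endpoint, authenticated with HTTP Basic auth built from the SID and token. A rough usage sketch, assuming the package layout in this PR; the credentials and the message are placeholders:

```go
package main

import (
	"fmt"

	"github.com/TwinProduction/gatus/core"
)

func main() {
	// Placeholder credentials; in Gatus these come from the alerting.twilio block of the configuration file.
	twilio := &core.TwilioAlertProvider{SID: "ACxxxx", Token: "xxxx", From: "+1-234-567-8901", To: "+1-234-567-8901"}
	provider := core.CreateTwilioCustomAlertProvider(twilio, "TRIGGERED: twinnation - healthcheck failed 5 times in a row")
	fmt.Println(provider.Url)                      // https://api.twilio.com/2010-04-01/Accounts/ACxxxx/Messages.json
	fmt.Println(provider.Method)                   // POST
	fmt.Println(provider.Body)                     // Body=TRIGGERED%3A+twinnation+...&From=%2B1-234-567-8901&To=%2B1-234-567-8901
	fmt.Println(provider.Headers["Authorization"]) // "Basic " followed by base64("ACxxxx:xxxx")
	// provider.Send("twinnation", "healthcheck failed 5 times in a row", false) would actually perform the request.
}
```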
| @ -46,7 +46,7 @@ type Service struct { | |||||||
| 	// Alerts is the alerting configuration for the service in case of failure | 	// Alerts is the alerting configuration for the service in case of failure | ||||||
| 	Alerts []*Alert `yaml:"alerts"` | 	Alerts []*Alert `yaml:"alerts"` | ||||||
|  |  | ||||||
| 	numberOfFailuresInARow int | 	NumberOfFailuresInARow int | ||||||
| } | } | ||||||
|  |  | ||||||
| func (service *Service) Validate() { | func (service *Service) Validate() { | ||||||
| @ -94,22 +94,16 @@ func (service *Service) EvaluateConditions() *Result { | |||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	result.Timestamp = time.Now() | 	result.Timestamp = time.Now() | ||||||
| 	if result.Success { |  | ||||||
| 		service.numberOfFailuresInARow = 0 |  | ||||||
| 		// TODO: Send notification that alert has been resolved? |  | ||||||
| 	} else { |  | ||||||
| 		service.numberOfFailuresInARow++ |  | ||||||
| 	} |  | ||||||
| 	return result | 	return result | ||||||
| } | } | ||||||
|  |  | ||||||
| func (service *Service) GetAlertsTriggered() []Alert { | func (service *Service) GetAlertsTriggered() []Alert { | ||||||
| 	var alerts []Alert | 	var alerts []Alert | ||||||
| 	if service.numberOfFailuresInARow == 0 { | 	if service.NumberOfFailuresInARow == 0 { | ||||||
| 		return alerts | 		return alerts | ||||||
| 	} | 	} | ||||||
| 	for _, alert := range service.Alerts { | 	for _, alert := range service.Alerts { | ||||||
| 		if alert.Enabled && alert.Threshold == service.numberOfFailuresInARow { | 		if alert.Enabled && alert.Threshold == service.NumberOfFailuresInARow { | ||||||
| 			alerts = append(alerts, *alert) | 			alerts = append(alerts, *alert) | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
|  | |||||||
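With NumberOfFailuresInARow now exported and maintained by the watchdog instead of EvaluateConditions, GetAlertsTriggered simply returns the enabled alerts whose threshold equals the current failure streak. A small, hypothetical usage sketch (field names taken from elsewhere in this diff):

```go
package main

import (
	"fmt"

	"github.com/TwinProduction/gatus/core"
)

func main() {
	service := &core.Service{
		Name: "twinnation",
		Alerts: []*core.Alert{
			{Type: core.SlackAlert, Enabled: true, Threshold: 3, Description: "healthcheck failed 3 times in a row"},
			{Type: core.SlackAlert, Enabled: true, Threshold: 5, Description: "healthcheck failed 5 times in a row"},
		},
	}
	// The watchdog increments this counter on every failed evaluation.
	service.NumberOfFailuresInARow = 3
	for _, alert := range service.GetAlertsTriggered() {
		fmt.Println(alert.Description) // only the threshold=3 alert is returned
	}
}
```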
							
								
								
									
go.sum (1 changed line)
							| @ -18,6 +18,7 @@ github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/me | |||||||
| github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= | github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= | ||||||
| github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= | ||||||
| github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= | ||||||
|  | github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs= | ||||||
| github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= | ||||||
| github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= | ||||||
| github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= | ||||||
|  | |||||||
							
								
								
									
main.go (8 changed lines)
							| @ -3,7 +3,6 @@ package main | |||||||
| import ( | import ( | ||||||
| 	"bytes" | 	"bytes" | ||||||
| 	"compress/gzip" | 	"compress/gzip" | ||||||
| 	"encoding/json" |  | ||||||
| 	"github.com/TwinProduction/gatus/config" | 	"github.com/TwinProduction/gatus/config" | ||||||
| 	"github.com/TwinProduction/gatus/watchdog" | 	"github.com/TwinProduction/gatus/watchdog" | ||||||
| 	"github.com/prometheus/client_golang/prometheus/promhttp" | 	"github.com/prometheus/client_golang/prometheus/promhttp" | ||||||
| @ -53,12 +52,11 @@ func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) { | |||||||
| 	if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired { | 	if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired { | ||||||
| 		buffer := &bytes.Buffer{} | 		buffer := &bytes.Buffer{} | ||||||
| 		gzipWriter := gzip.NewWriter(buffer) | 		gzipWriter := gzip.NewWriter(buffer) | ||||||
| 		serviceResults := watchdog.GetServiceResults() | 		data, err := watchdog.GetJsonEncodedServiceResults() | ||||||
| 		data, err := json.Marshal(serviceResults) |  | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			log.Printf("[main][serviceResultsHandler] Unable to marshall object to JSON: %s", err.Error()) | 			log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error()) | ||||||
| 			writer.WriteHeader(http.StatusInternalServerError) | 			writer.WriteHeader(http.StatusInternalServerError) | ||||||
| 			_, _ = writer.Write([]byte("Unable to marshall object to JSON")) | 			_, _ = writer.Write([]byte("Unable to marshal object to JSON")) | ||||||
| 			return | 			return | ||||||
| 		} | 		} | ||||||
| 		gzipWriter.Write(data) | 		gzipWriter.Write(data) | ||||||
|  | |||||||
| @ -1,25 +1,34 @@ | |||||||
| package watchdog | package watchdog | ||||||
|  |  | ||||||
| import ( | import ( | ||||||
| 	"encoding/base64" | 	"encoding/json" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"github.com/TwinProduction/gatus/config" | 	"github.com/TwinProduction/gatus/config" | ||||||
| 	"github.com/TwinProduction/gatus/core" | 	"github.com/TwinProduction/gatus/core" | ||||||
| 	"github.com/TwinProduction/gatus/metric" | 	"github.com/TwinProduction/gatus/metric" | ||||||
| 	"log" | 	"log" | ||||||
| 	"net/url" |  | ||||||
| 	"sync" | 	"sync" | ||||||
| 	"time" | 	"time" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| var ( | var ( | ||||||
| 	serviceResults = make(map[string][]*core.Result) | 	serviceResults = make(map[string][]*core.Result) | ||||||
| 	rwLock         sync.RWMutex |  | ||||||
|  | 	// serviceResultsMutex is used to prevent concurrent map access | ||||||
|  | 	serviceResultsMutex sync.RWMutex | ||||||
|  |  | ||||||
|  | 	// monitoringMutex is used to prevent multiple services from being evaluated at the same time. | ||||||
|  | 	// Without this, conditions using response time may become inaccurate. | ||||||
|  | 	monitoringMutex sync.Mutex | ||||||
| ) | ) | ||||||
|  |  | ||||||
| // GetServiceResults returns a list of the last 20 results for each services | // GetJsonEncodedServiceResults returns a list of the last 20 results for each services encoded using json.Marshal. | ||||||
| func GetServiceResults() *map[string][]*core.Result { | // The reason why the encoding is done here is because we use a mutex to prevent concurrent map access. | ||||||
| 	return &serviceResults | func GetJsonEncodedServiceResults() ([]byte, error) { | ||||||
|  | 	serviceResultsMutex.RLock() | ||||||
|  | 	data, err := json.Marshal(serviceResults) | ||||||
|  | 	serviceResultsMutex.RUnlock() | ||||||
|  | 	return data, err | ||||||
| } | } | ||||||
|  |  | ||||||
| // Monitor loops over each services and starts a goroutine to monitor each services separately | // Monitor loops over each services and starts a goroutine to monitor each services separately | ||||||
| @ -33,71 +42,72 @@ func Monitor(cfg *config.Config) { | |||||||
|  |  | ||||||
| // monitor monitors a single service in a loop | // monitor monitors a single service in a loop | ||||||
| func monitor(service *core.Service) { | func monitor(service *core.Service) { | ||||||
|  | 	cfg := config.Get() | ||||||
| 	for { | 	for { | ||||||
| 		// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which | 		// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which | ||||||
| 		// could cause performance issues and return inaccurate results | 		// could cause performance issues and return inaccurate results | ||||||
| 		rwLock.Lock() | 		monitoringMutex.Lock() | ||||||
|  | 		if cfg.Debug { | ||||||
| 			log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) | 			log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) | ||||||
|  | 		} | ||||||
| 		result := service.EvaluateConditions() | 		result := service.EvaluateConditions() | ||||||
| 		metric.PublishMetricsForService(service, result) | 		metric.PublishMetricsForService(service, result) | ||||||
|  | 		serviceResultsMutex.Lock() | ||||||
| 		serviceResults[service.Name] = append(serviceResults[service.Name], result) | 		serviceResults[service.Name] = append(serviceResults[service.Name], result) | ||||||
| 		if len(serviceResults[service.Name]) > 20 { | 		if len(serviceResults[service.Name]) > 20 { | ||||||
| 			serviceResults[service.Name] = serviceResults[service.Name][1:] | 			serviceResults[service.Name] = serviceResults[service.Name][1:] | ||||||
| 		} | 		} | ||||||
| 		rwLock.Unlock() | 		serviceResultsMutex.Unlock() | ||||||
| 		var extra string | 		var extra string | ||||||
| 		if !result.Success { | 		if !result.Success { | ||||||
| 			extra = fmt.Sprintf("responseBody=%s", result.Body) | 			extra = fmt.Sprintf("responseBody=%s", result.Body) | ||||||
| 		} | 		} | ||||||
| 		log.Printf( | 		log.Printf( | ||||||
| 			"[watchdog][monitor] Finished monitoring serviceName=%s; errors=%d; requestDuration=%s; %s", | 			"[watchdog][monitor] Monitored serviceName=%s; success=%v; errors=%d; requestDuration=%s; %s", | ||||||
| 			service.Name, | 			service.Name, | ||||||
|  | 			result.Success, | ||||||
| 			len(result.Errors), | 			len(result.Errors), | ||||||
| 			result.Duration.Round(time.Millisecond), | 			result.Duration.Round(time.Millisecond), | ||||||
| 			extra, | 			extra, | ||||||
| 		) | 		) | ||||||
|  | 		handleAlerting(service, result) | ||||||
|  | 		if cfg.Debug { | ||||||
|  | 			log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name) | ||||||
|  | 		} | ||||||
|  | 		monitoringMutex.Unlock() | ||||||
|  | 		time.Sleep(service.Interval) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func handleAlerting(service *core.Service, result *core.Result) { | ||||||
| 	cfg := config.Get() | 	cfg := config.Get() | ||||||
| 		if cfg.Alerting != nil { | 	if cfg.Alerting == nil { | ||||||
| 			for _, alertTriggered := range service.GetAlertsTriggered() { | 		return | ||||||
|  | 	} | ||||||
|  | 	if result.Success { | ||||||
|  | 		if service.NumberOfFailuresInARow > 0 { | ||||||
|  | 			for _, alert := range service.Alerts { | ||||||
|  | 				if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow { | ||||||
|  | 					continue | ||||||
|  | 				} | ||||||
| 				var alertProvider *core.CustomAlertProvider | 				var alertProvider *core.CustomAlertProvider | ||||||
| 				if alertTriggered.Type == core.SlackAlert { | 				if alert.Type == core.SlackAlert { | ||||||
| 					if len(cfg.Alerting.Slack) > 0 { | 					if len(cfg.Alerting.Slack) > 0 { | ||||||
| 						log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been triggered", alertTriggered.Description) | 						log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been resolved", alert.Description) | ||||||
| 						alertProvider = &core.CustomAlertProvider{ | 						alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true) | ||||||
| 							Url:     cfg.Alerting.Slack, |  | ||||||
| 							Method:  "POST", |  | ||||||
| 							Body:    fmt.Sprintf(`{"text":"*[Gatus]*\n*service:* %s\n*description:* %s"}`, service.Name, alertTriggered.Description), |  | ||||||
| 							Headers: map[string]string{"Content-Type": "application/json"}, |  | ||||||
| 						} |  | ||||||
| 					} else { | 					} else { | ||||||
| 						log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") | 						log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") | ||||||
| 					} |  | ||||||
| 				} else if alertTriggered.Type == core.TwilioAlert { |  | ||||||
| 					if len(cfg.Alerting.Twilio.Token) > 0 && |  | ||||||
| 						len(cfg.Alerting.Twilio.SID) > 0 && |  | ||||||
| 						len(cfg.Alerting.Twilio.From) > 0 && |  | ||||||
| 						len(cfg.Alerting.Twilio.To) > 0 { |  | ||||||
| 						log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alertTriggered.Description) |  | ||||||
| 						alertProvider = &core.CustomAlertProvider{ |  | ||||||
| 							Url:    fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", cfg.Alerting.Twilio.SID), |  | ||||||
| 							Method: "POST", |  | ||||||
| 							Body: url.Values{ |  | ||||||
| 								"To":   {cfg.Alerting.Twilio.To}, |  | ||||||
| 								"From": {cfg.Alerting.Twilio.From}, |  | ||||||
| 								"Body": {fmt.Sprintf("%s - %s", service.Name, alertTriggered.Description)}, |  | ||||||
| 							}.Encode(), |  | ||||||
| 							Headers: map[string]string{ |  | ||||||
| 								"Content-Type":  "application/x-www-form-urlencoded", |  | ||||||
| 								"Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", cfg.Alerting.Twilio.SID, cfg.Alerting.Twilio.Token)))), |  | ||||||
| 							}, |  | ||||||
| 					} | 					} | ||||||
|  | 				} else if alert.Type == core.TwilioAlert { | ||||||
|  | 					if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() { | ||||||
|  | 						log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been resolved", alert.Description) | ||||||
|  | 						alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description)) | ||||||
| 					} else { | 					} else { | ||||||
| 						log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because twilio config settings missing") | 						log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly") | ||||||
| 					} | 					} | ||||||
| 				} else if alertTriggered.Type == core.CustomAlert { | 				} else if alert.Type == core.CustomAlert { | ||||||
| 					if cfg.Alerting.Custom != nil && len(cfg.Alerting.Custom.Url) > 0 { | 					if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() { | ||||||
| 						log.Printf("[watchdog][monitor] Sending custom alert because alert with description=%s has been triggered", alertTriggered.Description) | 						log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been resolved", alert.Description) | ||||||
| 						alertProvider = &core.CustomAlertProvider{ | 						alertProvider = &core.CustomAlertProvider{ | ||||||
| 							Url:     cfg.Alerting.Custom.Url, | 							Url:     cfg.Alerting.Custom.Url, | ||||||
| 							Method:  cfg.Alerting.Custom.Method, | 							Method:  cfg.Alerting.Custom.Method, | ||||||
| @ -105,19 +115,59 @@ func monitor(service *core.Service) { | |||||||
| 							Headers: cfg.Alerting.Custom.Headers, | 							Headers: cfg.Alerting.Custom.Headers, | ||||||
| 						} | 						} | ||||||
| 					} else { | 					} else { | ||||||
| 						log.Printf("[watchdog][monitor] Not sending custom alert despite being triggered, because there is no custom url configured") | 						log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being resolved, because the custom provider isn't configured properly") | ||||||
| 					} | 					} | ||||||
| 				} | 				} | ||||||
| 				if alertProvider != nil { | 				if alertProvider != nil { | ||||||
| 					err := alertProvider.Send(service.Name, alertTriggered.Description) | 					err := alertProvider.Send(service.Name, alert.Description, true) | ||||||
| 					if err != nil { | 					if err != nil { | ||||||
| 						log.Printf("[watchdog][monitor] Ran into error sending an alert: %s", err.Error()) | 						log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error()) | ||||||
| 					} | 					} | ||||||
| 				} | 				} | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  | 		service.NumberOfFailuresInARow = 0 | ||||||
| 		log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s", service.Interval, service.Name) | 	} else { | ||||||
| 		time.Sleep(service.Interval) | 		service.NumberOfFailuresInARow++ | ||||||
|  | 		for _, alert := range service.Alerts { | ||||||
|  | 			// If the alert hasn't been triggered, move to the next one | ||||||
|  | 			if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow { | ||||||
|  | 				continue | ||||||
|  | 			} | ||||||
|  | 			var alertProvider *core.CustomAlertProvider | ||||||
|  | 			if alert.Type == core.SlackAlert { | ||||||
|  | 				if len(cfg.Alerting.Slack) > 0 { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been triggered", alert.Description) | ||||||
|  | 					alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false) | ||||||
|  | 				} else { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") | ||||||
|  | 				} | ||||||
|  | 			} else if alert.Type == core.TwilioAlert { | ||||||
|  | 				if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been triggered", alert.Description) | ||||||
|  | 					alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description)) | ||||||
|  | 				} else { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being triggered, because Twilio config settings missing") | ||||||
|  | 				} | ||||||
|  | 			} else if alert.Type == core.CustomAlert { | ||||||
|  | 				if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been triggered", alert.Description) | ||||||
|  | 					alertProvider = &core.CustomAlertProvider{ | ||||||
|  | 						Url:     cfg.Alerting.Custom.Url, | ||||||
|  | 						Method:  cfg.Alerting.Custom.Method, | ||||||
|  | 						Body:    cfg.Alerting.Custom.Body, | ||||||
|  | 						Headers: cfg.Alerting.Custom.Headers, | ||||||
|  | 					} | ||||||
|  | 				} else { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being triggered, because there is no custom url configured") | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 			if alertProvider != nil { | ||||||
|  | 				err := alertProvider.Send(service.Name, alert.Description, false) | ||||||
|  | 				if err != nil { | ||||||
|  | 					log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error()) | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  | |||||||
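Taken together, the new handleAlerting flow sends a TRIGGERED notification exactly when the failure streak reaches an alert's threshold, and a RESOLVED notification on the first success afterwards for every alert that had crossed its threshold and has `send-on-resolved: true`. The self-contained sketch below mirrors only that branching; the provider selection and HTTP calls are left out, and the `alertCfg` type and `decide` helper are hypothetical, used here purely for illustration.

```go
package main

import "fmt"

// alertCfg is a hypothetical, trimmed-down view of core.Alert used only for this sketch.
type alertCfg struct {
	Enabled        bool
	Threshold      int
	SendOnResolved bool
	Description    string
}

// decide mirrors the branching in handleAlerting: it returns the notifications that
// would be sent for the current result, given the failure streak *before* this result.
func decide(alerts []alertCfg, failuresInARow int, success bool) []string {
	var notifications []string
	if success {
		for _, a := range alerts {
			// Resolved notifications only go out for alerts that had actually triggered.
			if failuresInARow > 0 && a.Enabled && a.SendOnResolved && a.Threshold <= failuresInARow {
				notifications = append(notifications, "RESOLVED: "+a.Description)
			}
		}
		return notifications
	}
	failuresInARow++ // the watchdog increments the streak on failure before checking thresholds
	for _, a := range alerts {
		if a.Enabled && a.Threshold == failuresInARow {
			notifications = append(notifications, "TRIGGERED: "+a.Description)
		}
	}
	return notifications
}

func main() {
	alerts := []alertCfg{
		{Enabled: true, Threshold: 3, SendOnResolved: true, Description: "healthcheck failed 3 times in a row"},
		{Enabled: true, Threshold: 5, SendOnResolved: true, Description: "healthcheck failed 5 times in a row"},
	}
	fmt.Println(decide(alerts, 2, false)) // [TRIGGERED: healthcheck failed 3 times in a row]
	fmt.Println(decide(alerts, 5, true))  // both alerts report RESOLVED
}
```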