totoro: tweak alertmanager setup

This commit is contained in:
Luke Granger-Brown 2023-01-14 22:24:01 +00:00
parent ed03e709c5
commit ff0eff593d

View file

@ -270,6 +270,7 @@ in {
services.prometheus = {
enable = true;
stateDir = "export/monitoring/prometheus";
webExternalUrl = "https://prometheus.int.lukegb.com";
alertmanagers = [{
scheme = "http";
static_configs = [{
@ -373,13 +374,13 @@ in {
# Packet loss
# SmokepingAveragePacketLossHigh: pages (severity: page) when the packet-loss
# condition below, grouped by the `system` label, has held for 10 minutes.
# NOTE(review): two `expr:` and two `description:` keys appear in this rule.
# This looks like a diff rendering with the +/- markers stripped (old line
# followed by its replacement) -- confirm which one is live before editing.
# NOTE(review): in the host regex the `.` characters are unescaped, so they
# match any character rather than a literal dot -- confirm this is intended.
- alert: SmokepingAveragePacketLossHigh
# Average-based form: per-series loss ratio is
# (rate(requests) - rate(responses)) / rate(requests) over 5m; the rule fires
# when the average of that ratio per `system` is >= 0.01 (1%).
expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01
# Count-based form: clamp(..., 1, 1) turns every surviving series into the
# value 1, so the left-hand sum counts series whose loss ratio is > 0.01 and
# the right-hand sum counts all series matching the host regex. The rule fires
# when the left-hand count exceeds 0.4 (40%) of the right-hand count, per `system`.
# The resulting sample value is the left-hand side, i.e. the failing-series count.
expr: sum(clamp((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m]) > 0.01, 1, 1)) by (system) > sum(clamp(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}, 1, 1)) by (system) * 0.4
# The condition must hold continuously for this long before the alert fires.
for: 10m
labels:
severity: page
annotations:
# NOTE(review): the summary still says "Average packet loss", which matches the
# average-based expr but not the count-based one -- confirm the wording.
summary: "Average packet loss from {{ $labels.system }} high"
# Description paired with the average-based expr ($value is a ratio).
# NOTE(review): humanizePercentage presumably already appends "%", so the
# literal "%" after it would render twice -- verify.
description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}%, which is too high."
# Description paired with the count-based expr ($value is the number of
# failing series, reported here as "targets").
description: "Too many endpoints are failing packet loss checks from {{ $labels.system }} ({{ $value }} targets)."
- alert: SmokepingPacketLossVeryHigh
expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
for: 10m
@ -413,6 +414,7 @@ in {
alertmanager = {
enable = true;
webExternalUrl = "https://alertmanager.int.lukegb.com";
configuration = {
global = {};
route = {