# Patch summary for ops/nixos/totoro/default.nix (three hunks):
#
# 1. Adds webExternalUrl = "https://prometheus.int.lukegb.com" inside the
#    services.prometheus attribute set.
#
# 2. Rewrites the SmokepingAveragePacketLossHigh alert rule.
#    Old expr: avg by (system) of the per-series 5m loss ratio
#      (rate(requests) - rate(responses)) / rate(requests), firing at >= 0.01.
#    New expr: the inner "> 0.01" comparison keeps only series whose 5m loss
#      ratio exceeds 1%; clamp(..., 1, 1) sets every remaining sample to 1, so
#      sum(...) by (system) counts those series. The alert fires when that
#      count is greater than 0.4 times the number of smokeping_requests_total
#      series matching the same host regex for the same system.
#    The "for: 10m" duration and "severity: page" label are unchanged context.
#    The description annotation is changed to match: the old text piped $value
#      through humanizePercentage and then appended a literal "%"; the new text
#      prints $value as a number of targets.
#    NOTE(review): $value should be the left-hand side of the outer comparison
#      (the count of lossy targets) - confirm against Prometheus templating docs.
#    NOTE(review): the alert name and the summary annotation still say "Average
#      packet loss" although the new expr no longer computes an average.
#    NOTE(review): as shown here, the dots in the host regex are unescaped, so
#      each one matches any character, not only a literal "." - confirm intent.
#
# 3. Adds webExternalUrl = "https://alertmanager.int.lukegb.com" inside an
#    alertmanager attribute set (its enclosing scope is not visible in the hunk).
#
# NOTE(review): in this copy the entire patch sits on one physical line; the
#   original line breaks and indentation appear to be lost, so it presumably
#   cannot be applied as-is - regenerate from the commit before use.
diff --git a/ops/nixos/totoro/default.nix b/ops/nixos/totoro/default.nix index 58e9306509..df3bdf109f 100644 --- a/ops/nixos/totoro/default.nix +++ b/ops/nixos/totoro/default.nix @@ -270,6 +270,7 @@ in { services.prometheus = { enable = true; stateDir = "export/monitoring/prometheus"; + webExternalUrl = "https://prometheus.int.lukegb.com"; alertmanagers = [{ scheme = "http"; static_configs = [{ @@ -373,13 +374,13 @@ in { # Packet loss - alert: SmokepingAveragePacketLossHigh - expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01 + expr: sum(clamp((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m]) > 0.01, 1, 1)) by (system) > sum(clamp(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}, 1, 1)) by (system) * 0.4 for: 10m labels: severity: page annotations: summary: "Average packet loss from {{ $labels.system }} high" - description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}%, which is too high." + description: "Too many endpoints are failing packet loss checks from {{ $labels.system }} ({{ $value }} targets)." - alert: SmokepingPacketLossVeryHigh expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10 for: 10m @@ -413,6 +414,7 @@ in { alertmanager = { enable = true; + webExternalUrl = "https://alertmanager.int.lukegb.com"; configuration = { global = {}; route = {