totoro: tweak alertmanager setup

This commit is contained in:
Luke Granger-Brown 2023-01-14 22:24:01 +00:00
parent ed03e709c5
commit ff0eff593d

View file

@@ -270,6 +270,7 @@ in {
services.prometheus = { services.prometheus = {
enable = true; enable = true;
stateDir = "export/monitoring/prometheus"; stateDir = "export/monitoring/prometheus";
webExternalUrl = "https://prometheus.int.lukegb.com";
alertmanagers = [{ alertmanagers = [{
scheme = "http"; scheme = "http";
static_configs = [{ static_configs = [{
@@ -373,13 +374,13 @@ in {
# Packet loss # Packet loss
- alert: SmokepingAveragePacketLossHigh - alert: SmokepingAveragePacketLossHigh
expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01 expr: sum(clamp((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m]) > 0.01, 1, 1)) by (system) > sum(clamp(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}, 1, 1)) by (system) * 0.4
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: "Average packet loss from {{ $labels.system }} high" summary: "Average packet loss from {{ $labels.system }} high"
description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}%, which is too high." description: "Too many endpoints are failing packet loss checks from {{ $labels.system }} ({{ $value }} targets)."
- alert: SmokepingPacketLossVeryHigh - alert: SmokepingPacketLossVeryHigh
expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10 expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
for: 10m for: 10m
@@ -413,6 +414,7 @@ in {
alertmanager = { alertmanager = {
enable = true; enable = true;
webExternalUrl = "https://alertmanager.int.lukegb.com";
configuration = { configuration = {
global = {}; global = {};
route = { route = {