diff --git a/ops/nixos/totoro/default.nix b/ops/nixos/totoro/default.nix
index 6978d3e484..8ab8edb28c 100644
--- a/ops/nixos/totoro/default.nix
+++ b/ops/nixos/totoro/default.nix
@@ -291,6 +291,41 @@ in {
             annotations:
               summary: "NixOS Channel {{ $labels.channel }} failing"
               description: "The channel {{ $labels.channel }} is failing - see https://hydra.nixos.org/job/{{ $labels.project }}/{{ $labels.jobset }}/tested"
+
+          # Packet loss
+          # Loss ratio = (requests - responses) / requests over 5m, so $value is a
+          # 0-1 fraction and is rendered with humanizePercentage (0.01 -> "1%").
+          # Dots in the host matcher are written as [.] so they match only a
+          # literal dot; it selects dotted hostnames and dotted-quad IPv4 targets.
+          - alert: SmokepingAveragePacketLossHigh
+            expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9-]+[.])+[a-z]+|([0-9]+[.]){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01
+            for: 10m
+            labels:
+              severity: page
+            annotations:
+              summary: "Average packet loss from {{ $labels.system }} high"
+              description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}, which is too high."
+          - alert: SmokepingPacketLossVeryHigh
+            expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9-]+[.])+[a-z]+|([0-9]+[.]){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
+            for: 10m
+            labels:
+              severity: page
+            annotations:
+              summary: "Packet loss to {{ $labels.host }} from {{ $labels.system }} high"
+              description: "The packet loss from {{ $labels.system }} to {{ $labels.host }} (IP: {{ $labels.ip }}) is very high ({{ $value | humanizePercentage }})."
+
+          # Ping latency
+          # $value is the 95th-percentile response duration in seconds (threshold
+          # 0.03), rendered with humanizeDuration. Prometheus anchors regex
+          # matchers itself, so no ^...$ is needed around the host alternatives.
+          - alert: Smokeping95LatencyHigh
+            expr: histogram_quantile(0.95, sum(rate(smokeping_response_duration_seconds_bucket{host=~"1[.]1[.]1[.]1|8[.]8[.]8[.]8"}[5m])) by (le, host, system)) > 0.03
+            for: 15m
+            labels:
+              severity: page
+            annotations:
+              summary: "Ping latency from {{ $labels.system }} to {{ $labels.host }} high"
+              description: "The 95th-percentile ping latency from {{ $labels.system }} to {{ $labels.host }} is {{ $value | humanizeDuration }}."
         ''
       ];
 