Merge the blade-oa, availability and nixos Prometheus rule groups into a
single "alerting" group, marking the former group boundaries with comments.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. The leading whitespace on hunk lines is
reconstructed; confirm it against the target file, or apply with
`git apply --ignore-whitespace`.

diff --git a/ops/nixos/totoro/default.nix b/ops/nixos/totoro/default.nix
index 2cf4756126..7786e19f84 100644
--- a/ops/nixos/totoro/default.nix
+++ b/ops/nixos/totoro/default.nix
@@ -237,8 +237,10 @@ in {
     rules = [
       ''
         groups:
-        - name: blade-oa
+        - name: alerting
           rules:
+
+          # Blade power
           - alert: AveragePowerUsageTooHigh
             expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
             labels:
@@ -262,8 +264,8 @@ in {
             annotations:
               summary: "Blade: Single Blade Power Usage Out of Bounds"
               description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
-        - name: availability
-          rules:
+
+          # Systems
           - alert: NodeExporterDown
             expr: up{exporter="node", system=~"(blade-(tuvok|paris|janeway|torres)|kusakabe|marukuru|swann|totoro|clouvider-.*|etheroute-.*)"} < 1
             for: 30m
@@ -272,8 +274,8 @@ in {
             annotations:
               summary: "Node exporter no longer scrapable"
               description: "{{ $labels.system }} is not reachable from totoro."
-        - name: nixos
-          rules:
+
+          # Alert if the NixOS channels are broken
           - alert: NixOSChannelBad
             expr: hydra_job_failed{} == 1
             for: 30m