From dc2fb8e2b1f50009aba5b73d64f5154762e2caaa Mon Sep 17 00:00:00 2001
From: Luke Granger-Brown
Date: Mon, 15 Mar 2021 01:20:26 +0000
Subject: [PATCH] totoro: add power use alerts for blade system

---
Reviewer notes (text between the three-dash line and the diff is not part of
the commit):

  This patch replaces the empty `rules = [];` list with a single rule-file
  string that defines one group, `blade-oa`, containing three alerts. All
  three carry `severity: page`.

  * AveragePowerUsageTooHigh
      Sums the 10-minute avg_over_time of cpqRackPowerSupplyCurPwrOutput
      (job "blade-oa/snmp"), divides by 230 and fires when the result is
      above 6.5. It has no `for:` clause; the smoothing comes from the
      10-minute range in the expression itself.

  * PowerUsageTooHigh
      Same metric, divisor and threshold, but evaluated on the current
      samples and required to hold for 10m before firing.

  * BladePowerUsageOutOfBounds
      Compares every node_hwmon_power_average_watt series whose `system`
      label matches "blade-.*" against 1.5 x quantile(0.5) of that same
      selection, and must hold for 60m. `on () group_left()` is what lets
      the many per-blade series on the left match the single label-less
      aggregate on the right. Only the upper side is checked.

  NOTE(review): dividing by 230 presumably converts watts to amps at a 230 V
  supply, which would make {{ $value }} in the first two descriptions a
  current rather than a power figure - confirm, and consider naming the
  unit in the description text in a follow-up.

  NOTE(review): this copy of the patch was re-wrapped from a
  whitespace-collapsed form. Leading indentation and the blank context lines
  in the hunk below are a best-effort reconstruction; check them against the
  target file (or apply with whitespace tolerance) before relying on it.

 ops/nixos/totoro/default.nix | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/ops/nixos/totoro/default.nix b/ops/nixos/totoro/default.nix
index 5c244ed27c..eb56951ddb 100644
--- a/ops/nixos/totoro/default.nix
+++ b/ops/nixos/totoro/default.nix
@@ -219,7 +219,36 @@ in {
 
     pushgateway.enable = true;
 
-    rules = [];
+    rules = [
+      ''
+        groups:
+          - name: blade-oa
+            rules:
+              - alert: AveragePowerUsageTooHigh
+                expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
+                labels:
+                  severity: page
+                annotations:
+                  summary: "Blade: Power Usage Too High (rolling)"
+                  description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
+              - alert: PowerUsageTooHigh
+                expr: (sum(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}) / 230) > 6.5
+                for: 10m
+                labels:
+                  severity: page
+                annotations:
+                  summary: "Blade: Power Usage Too High"
+                  description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
+              - alert: BladePowerUsageOutOfBounds
+                expr: node_hwmon_power_average_watt{system=~"blade-.*"} > on () group_left() (1.5 * quantile(0.5, node_hwmon_power_average_watt{system=~"blade-.*"}))
+                for: 60m
+                labels:
+                  severity: page
+                annotations:
+                  summary: "Blade: Single Blade Power Usage Out of Bounds"
+                  description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
+      ''
+    ];
 
     alertmanager = {
       enable = true;