totoro: add power use alerts for blade system

This commit is contained in:
Luke Granger-Brown 2021-03-15 01:20:26 +00:00
parent 238d3ad7a8
commit dc2fb8e2b1

View file

@ -219,7 +219,36 @@ in {
pushgateway.enable = true; pushgateway.enable = true;
rules = []; rules = [
# Prometheus alerting rules for the blade system, defined as a single rule
# group named "blade-oa". All three alerts are labelled severity "page".
#
# AveragePowerUsageTooHigh
#   Sums the 10-minute averages of cpqRackPowerSupplyCurPwrOutput for job
#   "blade-oa/snmp", divides the sum by 230 and alerts when the result is
#   above 6.5. It has no "for:" clause; the 10-minute window is expressed
#   through the avg_over_time range instead.
# PowerUsageTooHigh
#   Same quotient and threshold, but computed from the current sample values;
#   the condition has to hold for 10m before the alert fires.
# BladePowerUsageOutOfBounds
#   Selects every node_hwmon_power_average_watt series whose "system" label
#   matches blade-.* and whose value is above 1.5 times the 0.5-quantile
#   (median) taken over all of those series, sustained for 60m. The empty
#   on () together with group_left() matches each per-blade series against
#   the single label-less quantile result, so $labels.system stays available
#   to the description template.
#
# NOTE(review): 230 looks like a mains voltage, which would make the compared
#   value a current in amps and 6.5 a current budget - confirm, and consider
#   stating the unit in the descriptions.
# NOTE(review): despite the "OutOfBounds" name only an upper bound is checked;
#   nothing here fires for an unusually low per-blade reading.
''
groups:
- name: blade-oa
rules:
- alert: AveragePowerUsageTooHigh
expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
labels:
severity: page
annotations:
summary: "Blade: Power Usage Too High (rolling)"
description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
- alert: PowerUsageTooHigh
expr: (sum(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}) / 230) > 6.5
for: 10m
labels:
severity: page
annotations:
summary: "Blade: Power Usage Too High"
description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
- alert: BladePowerUsageOutOfBounds
expr: node_hwmon_power_average_watt{system=~"blade-.*"} > on () group_left() (1.5 * quantile(0.5, node_hwmon_power_average_watt{system=~"blade-.*"}))
for: 60m
labels:
severity: page
annotations:
summary: "Blade: Single Blade Power Usage Out of Bounds"
description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
''
];
alertmanager = { alertmanager = {
enable = true; enable = true;