totoro: add power use alerts for blade system
This commit is contained in:
parent
238d3ad7a8
commit
dc2fb8e2b1
1 changed file with 30 additions and 1 deletion
|
@@ -219,7 +219,36 @@ in {
|
||||||
|
|
||||||
pushgateway.enable = true;
|
pushgateway.enable = true;
|
||||||
|
|
||||||
rules = [];
|
rules = [
|
||||||
|
''
|
||||||
|
groups:
|
||||||
|
- name: blade-oa
|
||||||
|
rules:
|
||||||
|
- alert: AveragePowerUsageTooHigh
|
||||||
|
expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: "Blade: Power Usage Too High (rolling)"
|
||||||
|
description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
|
||||||
|
- alert: PowerUsageTooHigh
|
||||||
|
expr: (sum(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}) / 230) > 6.5
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: "Blade: Power Usage Too High"
|
||||||
|
description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
|
||||||
|
- alert: BladePowerUsageOutOfBounds
|
||||||
|
expr: node_hwmon_power_average_watt{system=~"blade-.*"} > on () group_left() (1.5 * quantile(0.5, node_hwmon_power_average_watt{system=~"blade-.*"}))
|
||||||
|
for: 60m
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: "Blade: Single Blade Power Usage Out of Bounds"
|
||||||
|
description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
|
||||||
|
''
|
||||||
|
];
|
||||||
|
|
||||||
alertmanager = {
|
alertmanager = {
|
||||||
enable = true;
|
enable = true;
|
||||||
|
|
Loading…
Reference in a new issue