totoro: add power use alerts for blade system
This commit is contained in:
parent
238d3ad7a8
commit
dc2fb8e2b1
1 changed file with 30 additions and 1 deletion
|
@@ -219,7 +219,36 @@ in {
|
|||
|
||||
pushgateway.enable = true;
|
||||
|
||||
rules = [];
|
||||
# Alerting rules, given as a one-element list holding a single indented
# string. The string is a rule-file document with one group, "blade-oa",
# containing three alerts; all of them carry `severity: page`.
# NOTE(review): presumably this is the Prometheus module's `rules` option;
# the enclosing attribute path is not visible here -- confirm.
#
#   AveragePowerUsageTooHigh
#     Sums the 10-minute average of cpqRackPowerSupplyCurPwrOutput over all
#     series of job "blade-oa/snmp", divides by 230 and fires when the result
#     exceeds 6.5. There is no `for:` clause; the 10m window is the smoothing.
#     NOTE(review): 230 looks like mains voltage (watts -> amps) and 6.5 the
#     current budget -- confirm the units.
#
#   PowerUsageTooHigh
#     Same quantity without averaging (instantaneous sum / 230 > 6.5), but the
#     condition has to hold for 10m before the alert fires.
#
#   BladePowerUsageOutOfBounds
#     Fires for any series with system=~"blade-.*" whose
#     node_hwmon_power_average_watt has been above 1.5x the median
#     (quantile 0.5) of all such series for 60m. Only the upper side is
#     checked; a blade drawing far less than the median does not alert.
#
# The {{ $value }} and {{ $labels.system }} placeholders are passed through
# verbatim: Nix only interpolates `''${...}`-free `${...}` sequences, and none
# occur in the string below.
rules = [
|
||||
''
|
||||
groups:
|
||||
- name: blade-oa
|
||||
rules:
|
||||
- alert: AveragePowerUsageTooHigh
|
||||
expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Blade: Power Usage Too High (rolling)"
|
||||
description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
|
||||
- alert: PowerUsageTooHigh
|
||||
expr: (sum(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}) / 230) > 6.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Blade: Power Usage Too High"
|
||||
description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
|
||||
- alert: BladePowerUsageOutOfBounds
|
||||
expr: node_hwmon_power_average_watt{system=~"blade-.*"} > on () group_left() (1.5 * quantile(0.5, node_hwmon_power_average_watt{system=~"blade-.*"}))
|
||||
for: 60m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
summary: "Blade: Single Blade Power Usage Out of Bounds"
|
||||
description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
|
||||
''
|
||||
|
||||
];
|
||||
|
||||
alertmanager = {
|
||||
enable = true;
|
||||
|
|
Loading…
Reference in a new issue