totoro: tweak alertmanager setup
This commit is contained in:
parent
ed03e709c5
commit
ff0eff593d
1 changed file with 4 additions and 2 deletions
|
@@ -270,6 +270,7 @@ in {
|
||||||
services.prometheus = {
|
services.prometheus = {
|
||||||
enable = true;
|
enable = true;
|
||||||
stateDir = "export/monitoring/prometheus";
|
stateDir = "export/monitoring/prometheus";
|
||||||
|
webExternalUrl = "https://prometheus.int.lukegb.com";
|
||||||
alertmanagers = [{
|
alertmanagers = [{
|
||||||
scheme = "http";
|
scheme = "http";
|
||||||
static_configs = [{
|
static_configs = [{
|
||||||
|
@@ -373,13 +374,13 @@ in {
|
||||||
|
|
||||||
# Packet loss
|
# Packet loss
|
||||||
- alert: SmokepingAveragePacketLossHigh
|
- alert: SmokepingAveragePacketLossHigh
|
||||||
expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01
|
expr: sum(clamp((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m]) > 0.01, 1, 1)) by (system) > sum(clamp(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}, 1, 1)) by (system) * 0.4
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Average packet loss from {{ $labels.system }} high"
|
summary: "Average packet loss from {{ $labels.system }} high"
|
||||||
description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}%, which is too high."
|
description: "Too many endpoints are failing packet loss checks from {{ $labels.system }} ({{ $value }} targets)."
|
||||||
- alert: SmokepingPacketLossVeryHigh
|
- alert: SmokepingPacketLossVeryHigh
|
||||||
expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
|
expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
|
||||||
for: 10m
|
for: 10m
|
||||||
|
@@ -413,6 +414,7 @@ in {
|
||||||
|
|
||||||
alertmanager = {
|
alertmanager = {
|
||||||
enable = true;
|
enable = true;
|
||||||
|
webExternalUrl = "https://alertmanager.int.lukegb.com";
|
||||||
configuration = {
|
configuration = {
|
||||||
global = {};
|
global = {};
|
||||||
route = {
|
route = {
|
||||||
|
|
Loading…
Reference in a new issue