# Change note (commit 4facdd8515): Grafana's threshold expression requires a
# scalar input, not a raw time series. An explicit reduce step (type: reduce,
# reducer: last) was added as refId B between the Prometheus query (A) and the
# threshold check (C). All 4 rules were updated: CrashLoopBackOff, Disco > 80%,
# RAM > 85%, Pod Failed. The condition field changed from B to C on each rule.
---
# ConfigMap bundling three Grafana alerting provisioning files:
#   - contact-points.yaml      where notifications are delivered
#   - notification-policy.yaml how alerts are grouped and routed
#   - alert-rules.yaml         the alert rules themselves
# NOTE(review): presumably mounted into Grafana's provisioning/alerting
# directory; the mount is not defined in this file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              # Expected to be expanded from Grafana's environment when the
              # file is provisioned; the token itself is never stored here.
              bottoken: '${TELEGRAM_BOT_TOKEN}'
              # Quoted so the all-digit chat ID stays a string.
              chatid: '5138407666'
              # Grafana's Telegram integration reads `parse_mode`; a
              # camelCase `parseMode` key is not a recognised setting and
              # would be ignored.
              parse_mode: HTML
            disableResolveMessage: false

  notification-policy.yaml: |
    apiVersion: 1
    policies:
      # Single root policy: every alert goes to the Telegram contact point,
      # grouped per alert name / namespace / pod. No child routes.
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:

          # Every rule below uses the same three-step pipeline:
          #   A  Prometheus instant query
          #   B  reduce (last) - turns each series from A into a single number
          #   C  threshold     - the alert condition, evaluated on B
          # datasourceUid "-100" is the legacy UID of Grafana's built-in
          # expression engine (newer exports use "__expr__").

          # Fires for containers whose waiting reason is CrashLoopBackOff.
          # An empty query result is treated as healthy (noDataState: OK).
          - uid: homelab-crashloop
            title: 'Pod CrashLoopBackOff'
            condition: C
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: 'CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})'
              description: 'Container {{ $labels.container }} ha entrado en CrashLoopBackOff.'
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0]

          # Fires when a filesystem's used fraction, (size - avail) / size,
          # stays above 0.8 for 5 minutes. tmpfs, overlay, squashfs and
          # devtmpfs filesystems are excluded by the fstype matcher.
          - uid: homelab-disk-high
            title: 'Disco > 80%'
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: 'Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}'
              description: 'Filesystem {{ $labels.mountpoint }} supera el 80% de uso.'
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: '(node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"} - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0.8]

          # Fires when the used-memory fraction,
          # (MemTotal - MemAvailable) / MemTotal, stays above 0.85 for
          # 5 minutes.
          - uid: homelab-ram-high
            title: 'RAM > 85%'
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: 'RAM alta: {{ $labels.instance }}'
              description: 'Uso de RAM supera el 85% en {{ $labels.instance }}.'
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0.85]

          # Fires when a pod reports a phase other than Running or Succeeded
          # for 3 minutes.
          # NOTE(review): the matcher excludes only Running|Succeeded, so a
          # pod stuck in Pending also matches even though the title says
          # "Failed/Unknown" - confirm that this is intended.
          - uid: homelab-pod-failed
            title: 'Pod Failed/Unknown'
            condition: C
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: 'Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})'
              description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}.'
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0]