94c059ccb9
- Secret grafana-telegram: bot token + chat ID (env var injection)
- ConfigMap grafana-alerting: provisioning files for contact point,
notification policy, and 4 alert rules
* Pod CrashLoopBackOff (for: 1m, noData: OK)
* Disk > 80% on non-tmpfs filesystems (for: 5m)
* RAM > 85% (for: 5m)
* Pod Failed/Unknown (for: 3m, noData: OK)
- Deployment: TELEGRAM_* env vars from secret + alerting volume mount
Token interpolated via ${TELEGRAM_BOT_TOKEN} in provisioning YAML.
179 lines
6.1 KiB
YAML
179 lines
6.1 KiB
YAML
---
# Grafana alerting provisioning bundle for the homelab monitoring stack.
# Each key under `data` is one Grafana provisioning file (contact point,
# notification policy, alert rules). The Grafana Deployment is expected to
# mount this ConfigMap as its alerting provisioning volume and to expose
# TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID from the grafana-telegram Secret.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  # Telegram contact point. The ${...} placeholders are left literal here and
  # resolved by Grafana's provisioning env-var interpolation at load time, so
  # the bot token never has to live in this ConfigMap.
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              chatid: "${TELEGRAM_CHAT_ID}"
              # The Telegram integration reads `parse_mode`; the previous
              # `parseMode` spelling was not a recognised setting key.
              parse_mode: HTML
            # Receiver-level flag: keep sending "resolved" notifications.
            disableResolveMessage: false

  # Root notification policy: everything goes to the Telegram contact point,
  # grouped per alert/namespace/pod, with no child routes.
  notification-policy.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  # Four alert rules in a single group evaluated every minute. Each rule has
  # the same two-step shape:
  #   A - instant Prometheus query (datasourceUid: prometheus)
  #   B - threshold expression over A (datasourceUid "-100" is the legacy
  #       UID of Grafana's built-in expression engine); B is the condition.
  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:

          # Fires when any container reports the CrashLoopBackOff waiting
          # reason. No series simply means nothing is crash-looping, hence
          # noDataState OK.
          - uid: homelab-crashloop
            title: "Pod CrashLoopBackOff"
            condition: B
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold; required by the expression engine.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          # Fires when a real (non tmpfs/overlay/squashfs/devtmpfs) filesystem
          # is more than 80% used. Usage ratio = (size - avail) / size.
          - uid: homelab-disk-high
            title: "Disco > 80%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Folded scalar: the three lines join with single spaces
                  # into one PromQL expression.
                  expr: >-
                    (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                    - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"})
                    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold; required by the expression engine.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0.8], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          # Fires when node memory usage, (total - available) / total,
          # stays above 85%.
          - uid: homelab-ram-high
            title: "RAM > 85%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "RAM alta: {{ $labels.instance }}"
              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes"
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold; required by the expression engine.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0.85], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          # Fires when a pod sits in any phase other than Running/Succeeded.
          # NOTE(review): the selector also matches Pending, which is wider
          # than the "Failed/Unknown" title suggests - confirm this is wanted.
          - uid: homelab-pod-failed
            title: "Pod Failed/Unknown"
            condition: B
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold; required by the expression engine.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}