feat(monitoring): Grafana alerting → Telegram for homelab
- Secret grafana-telegram: bot token + chat ID (env var injection)
- ConfigMap grafana-alerting: provisioning files for contact point,
notification policy, and 4 alert rules
* Pod CrashLoopBackOff (for: 1m, noData: OK)
* Disk > 80% on non-tmpfs filesystems (for: 5m)
* RAM > 85% (for: 5m)
* Pod Failed/Unknown (for: 3m, noData: OK)
- Deployment: TELEGRAM_* env vars from secret + alerting volume mount
Token interpolated via ${TELEGRAM_BOT_TOKEN} in provisioning YAML.
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
# Grafana alerting provisioning for the homelab: one Telegram contact point,
# one catch-all notification policy, and four alert rules. Each key under
# `data` is a provisioning file; the commit message says the ConfigMap is
# mounted into the Grafana Deployment as an alerting volume.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  # Contact point. ${TELEGRAM_BOT_TOKEN} / ${TELEGRAM_CHAT_ID} are expanded by
  # Grafana from environment variables when the file is provisioned.
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              chatid: "${TELEGRAM_CHAT_ID}"
              # The Telegram integration reads `parse_mode`, not `parseMode`.
              parse_mode: HTML
            # Receiver-level flag (sibling of `settings`): keep sending
            # "resolved" notifications.
            disableResolveMessage: false

  # Root policy: every alert goes to the Telegram contact point, grouped per
  # alert name / namespace / pod. There are no child routes.
  notification-policy.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  # Alert rules. Every rule has the same shape: refId A is an instant
  # Prometheus query, refId B is a threshold expression over A, and B is the
  # alert condition.
  alert-rules.yaml: |
    apiVersion: 1
    # NOTE(review): annotations use {{ $labels.* }} templates. Some Grafana
    # versions apply environment-variable interpolation to provisioning files;
    # confirm the deployed version leaves `$labels` untouched in annotations.
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:

          # Fires when any container reports the CrashLoopBackOff waiting
          # reason for 1 minute. No data means nothing is crash-looping.
          - uid: homelab-crashloop
            title: "Pod CrashLoopBackOff"
            condition: B
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                # `__expr__` is the server-side expression datasource.
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  # A threshold expression needs its input named explicitly;
                  # without `expression` it has nothing to evaluate.
                  expression: A
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0]

          # Fires when a non-ephemeral filesystem stays above 80% used for
          # 5 minutes.
          - uid: homelab-disk-high
            title: "Disco > 80%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Used fraction = (size - avail) / size. Folded scalar: the
                  # three lines join with single spaces into one expression.
                  expr: >-
                    (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                    - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"})
                    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0.8]

          # Fires when a node's memory usage stays above 85% for 5 minutes.
          - uid: homelab-ram-high
            title: "RAM > 85%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "RAM alta: {{ $labels.instance }}"
              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0.85]

          # Fires when a pod stays in phase Failed or Unknown for 3 minutes.
          - uid: homelab-pod-failed
            title: "Pod Failed/Unknown"
            condition: B
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Match only the phases named in the rule title; the former
                  # negative match (!~"Running|Succeeded") also caught Pending.
                  expr: 'kube_pod_status_phase{phase=~"Failed|Unknown"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0]
|
||||
Reference in New Issue
Block a user