feat(monitoring): Grafana alerting → Telegram for homelab

- Secret grafana-telegram: bot token + chat ID (env var injection)
- ConfigMap grafana-alerting: provisioning files for contact point, notification policy, and 4 alert rules
  * Pod CrashLoopBackOff (for: 1m, noData: OK)
  * Disk > 80% on filesystems other than tmpfs, devtmpfs, overlay and squashfs (for: 5m)
  * RAM > 85% (for: 5m)
  * Pod Failed/Unknown (for: 3m, noData: OK)
- Deployment: TELEGRAM_* env vars from secret + alerting volume mount

Token interpolated via ${TELEGRAM_BOT_TOKEN} in provisioning YAML.
2026-04-26 15:25:07 +00:00
parent 5df2e9746a
commit 94c059ccb9
3 changed files with 203 additions and 0 deletions
@@ -0,0 +1,178 @@
+# Grafana alerting provisioning files, one file per data key.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-alerting
+  namespace: monitoring
+data:
+  # Contact point that delivers alert notifications to a Telegram chat.
+  contact-points.yaml: |
+    apiVersion: 1
+    contactPoints:
+      - orgId: 1
+        name: Telegram Homelab
+        receivers:
+          - uid: telegram-homelab
+            type: telegram
+            # false: "resolved" notifications are sent as well.
+            disableResolveMessage: false
+            settings:
+              # Both placeholders are expanded from environment variables
+              # when Grafana loads this file; no secret is stored here.
+              bottoken: "${TELEGRAM_BOT_TOKEN}"
+              chatid: "${TELEGRAM_CHAT_ID}"
+              # The Telegram integration reads the snake_case key
+              # "parse_mode"; a camelCase "parseMode" key is ignored.
+              parse_mode: HTML
+
+  # Notification policy: a single policy without child routes, so every
+  # alert is delivered to the Telegram contact point.
+  notification-policy.yaml: |
+    apiVersion: 1
+    policies:
+      - orgId: 1
+        receiver: Telegram Homelab
+        routes: []
+        # Alerts sharing these label values are batched into one message.
+        group_by: [alertname, namespace, pod]
+        # Timing: delay before the first message of a new group, delay
+        # between updates of a group, and re-send period while firing.
+        group_wait: 30s
+        group_interval: 5m
+        repeat_interval: 4h
+
+  # Alert rules: one group, evaluated every minute, stored in the
+  # "Homelab Alerts" folder.
+  alert-rules.yaml: |
+    apiVersion: 1
+    groups:
+      - name: homelab-infra
+        orgId: 1
+        folder: "Homelab Alerts"
+        interval: 1m
+        rules:
+
+          # Critical alert: a container has been waiting with reason
+          # CrashLoopBackOff for at least 1m. An empty query result is
+          # treated as OK (noDataState).
+          - uid: homelab-crashloop
+            title: "Pod CrashLoopBackOff"
+            condition: B
+            for: 1m
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
+              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
+            labels:
+              severity: critical
+            isPaused: false
+            data:
+              # A: instant PromQL query over the last 5 minutes.
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
+                  instant: true
+                  refId: A
+              # B: threshold expression, true when A > 0.
+              # "-100" is the legacy UID of Grafana's expression engine.
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  # Required: names the refId the threshold is applied to.
+                  # Without it Grafana cannot build the expression.
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
+
+          # Warning alert: used fraction of a filesystem stays above 0.8
+          # for 5m. tmpfs, overlay, squashfs and devtmpfs are excluded.
+          # A missing query result puts the rule into the NoData state.
+          - uid: homelab-disk-high
+            title: "Disco > 80%"
+            condition: B
+            for: 5m
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
+              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              # A: (size - available) / size, as a ratio between 0 and 1.
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: '(node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"} - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}'
+                  instant: true
+                  refId: A
+              # B: threshold expression, true when A > 0.8.
+              # "-100" is the legacy UID of Grafana's expression engine.
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  # Required: names the refId the threshold is applied to.
+                  # Without it Grafana cannot build the expression.
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0.8], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
+
+          # Warning alert: used memory fraction of a node stays above 0.85
+          # for 5m. A missing query result puts the rule into the NoData
+          # state.
+          - uid: homelab-ram-high
+            title: "RAM > 85%"
+            condition: B
+            for: 5m
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "RAM alta: {{ $labels.instance }}"
+              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              # A: (total - available) / total, as a ratio between 0 and 1.
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
+                  instant: true
+                  refId: A
+              # B: threshold expression, true when A > 0.85.
+              # "-100" is the legacy UID of Grafana's expression engine.
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  # Required: names the refId the threshold is applied to.
+                  # Without it Grafana cannot build the expression.
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0.85], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
+
+          # Warning alert: a pod has been in phase Failed or Unknown for 3m.
+          # An empty query result is treated as OK (noDataState).
+          - uid: homelab-pod-failed
+            title: "Pod Failed/Unknown"
+            condition: B
+            for: 3m
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
+              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              # A: selects only the Failed and Unknown phases, matching the
+              # rule title. A negative match on Running|Succeeded would
+              # also fire for Pending pods.
+              # NOTE(review): widen the selector if Pending should alert.
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: 'kube_pod_status_phase{phase=~"Failed|Unknown"}'
+                  instant: true
+                  refId: A
+              # B: threshold expression, true when A > 0.
+              # "-100" is the legacy UID of Grafana's expression engine.
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  # Required: names the refId the threshold is applied to.
+                  # Without it Grafana cannot build the expression.
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}