From 94c059ccb90efac58ddff2bac0b11210ee0a68ee Mon Sep 17 00:00:00 2001
From: chemavx <bot@chemavx.xyz>
Date: Sun, 26 Apr 2026 15:25:07 +0000
Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20Grafana=20alerting=20?=
 =?UTF-8?q?=E2=86=92=20Telegram=20for=20homelab?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Secret grafana-telegram: bot token + chat ID (env var injection)
- ConfigMap grafana-alerting: provisioning files for contact point,
  notification policy, and 4 alert rules
  * Pod CrashLoopBackOff (for: 1m, noData: OK)
  * Disk > 80%, excluding tmpfs/overlay/squashfs/devtmpfs filesystems (for: 5m)
  * RAM > 85% (for: 5m)
  * Pod Failed/Unknown (for: 3m, noData: OK)
- Deployment: TELEGRAM_* env vars from secret + alerting volume mount

Token interpolated via ${TELEGRAM_BOT_TOKEN} in provisioning YAML.
---
 monitoring/configmap-grafana-alerting.yaml    | 178 ++++++++++++++++++
 ...loyment-kube-prometheus-stack-grafana.yaml |  16 ++
 monitoring/secret-grafana-telegram.yaml       |   9 +
 3 files changed, 203 insertions(+)
 create mode 100644 monitoring/configmap-grafana-alerting.yaml
 create mode 100644 monitoring/secret-grafana-telegram.yaml

diff --git a/monitoring/configmap-grafana-alerting.yaml b/monitoring/configmap-grafana-alerting.yaml
new file mode 100644
index 0000000..b80c6f9
--- /dev/null
+++ b/monitoring/configmap-grafana-alerting.yaml
@@ -0,0 +1,178 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-alerting  # referenced by the grafana-alerting volume of the Grafana Deployment
+  namespace: monitoring
+data:  # each key below is one Grafana alerting provisioning file
+  contact-points.yaml: |
+    apiVersion: 1
+    contactPoints:
+      - orgId: 1
+        name: Telegram Homelab  # must match the receiver name used by the notification policy
+        receivers:
+          - uid: telegram-homelab
+            type: telegram
+            settings:
+              bottoken: "${TELEGRAM_BOT_TOKEN}"  # interpolated by Grafana from the container environment
+              chatid: "${TELEGRAM_CHAT_ID}"  # interpolated by Grafana from the container environment
+              parse_mode: HTML  # the Telegram receiver reads the snake_case key; parseMode is ignored
+            disableResolveMessage: false  # also notify when an alert resolves
+
+  notification-policy.yaml: |
+    apiVersion: 1
+    policies:
+      - orgId: 1
+        receiver: Telegram Homelab  # contact point name declared in contact-points.yaml
+        group_by:  # alerts sharing these label values are batched into one notification
+          - alertname
+          - namespace
+          - pod
+        group_wait: 30s  # delay before the first notification of a new group
+        group_interval: 5m  # minimum gap between notifications when a group changes
+        repeat_interval: 4h  # re-send cadence while an alert keeps firing
+        routes: []  # no child routes: every alert goes to the root receiver
+
+  alert-rules.yaml: |
+    apiVersion: 1
+    groups:
+      - orgId: 1
+        name: homelab-infra
+        folder: Homelab Alerts  # Grafana folder that holds the rules of this group
+        interval: 1m  # evaluation interval shared by every rule in the group
+        rules:
+
+          - uid: homelab-crashloop
+            title: "Pod CrashLoopBackOff"
+            condition: B  # refId of the expression whose result decides firing
+            for: 1m
+            noDataState: OK  # an empty result is treated as healthy
+            execErrState: Error
+            annotations:
+              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
+              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
+            labels:
+              severity: critical
+            isPaused: false
+            data:
+              - refId: A  # instant Prometheus query feeding the threshold below
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
+                  instant: true
+                  refId: A
+              - refId: B  # threshold expression: fires when A > 0
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: __expr__  # Grafana's built-in expression engine
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A  # required input refId; a threshold without it fails to evaluate
+                  conditions:
+                    - evaluator:
+                        type: gt
+                        params:
+                          - 0
+
+          - uid: homelab-disk-high
+            title: "Disco > 80%"
+            condition: B  # refId of the expression whose result decides firing
+            for: 5m
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
+              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              - refId: A  # used fraction (size - avail) / size, pseudo filesystems excluded
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: "(node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs|devtmpfs\"} - node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs|devtmpfs\"}) / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs|devtmpfs\"}"
+                  instant: true
+                  refId: A
+              - refId: B  # threshold expression: fires when A > 0.8
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: __expr__  # Grafana's built-in expression engine
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A  # required input refId; a threshold without it fails to evaluate
+                  conditions:
+                    - evaluator:
+                        type: gt
+                        params:
+                          - 0.8
+
+          - uid: homelab-ram-high
+            title: "RAM > 85%"
+            condition: B  # refId of the expression whose result decides firing
+            for: 5m
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "RAM alta: {{ $labels.instance }}"
+              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              - refId: A  # used fraction (total - available) / total
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes"
+                  instant: true
+                  refId: A
+              - refId: B  # threshold expression: fires when A > 0.85
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: __expr__  # Grafana's built-in expression engine
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A  # required input refId; a threshold without it fails to evaluate
+                  conditions:
+                    - evaluator:
+                        type: gt
+                        params:
+                          - 0.85
+
+          - uid: homelab-pod-failed
+            title: "Pod Failed/Unknown"
+            condition: B  # refId of the expression whose result decides firing
+            for: 3m
+            noDataState: OK  # an empty result is treated as healthy
+            execErrState: Error
+            annotations:
+              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
+              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              - refId: A  # only the phases named in the title; the negative match also caught Pending
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: "kube_pod_status_phase{phase=~\"Failed|Unknown\"}"
+                  instant: true
+                  refId: A
+              - refId: B  # threshold expression: fires when A > 0
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: __expr__  # Grafana's built-in expression engine
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A  # required input refId; a threshold without it fails to evaluate
+                  conditions:
+                    - evaluator:
+                        type: gt
+                        params:
+                          - 0
diff --git a/monitoring/deployment-kube-prometheus-stack-grafana.yaml b/monitoring/deployment-kube-prometheus-stack-grafana.yaml
index 19ef1d2..a023d46 100644
--- a/monitoring/deployment-kube-prometheus-stack-grafana.yaml
+++ b/monitoring/deployment-kube-prometheus-stack-grafana.yaml
@@ -152,6 +152,16 @@ spec:
           value: /etc/grafana/provisioning
         - name: GF_UNIFIED_STORAGE_INDEX_PATH
           value: /var/lib/grafana-search/bleve
+        - name: TELEGRAM_BOT_TOKEN  # referenced as ${TELEGRAM_BOT_TOKEN} by the contact point provisioning
+          valueFrom:
+            secretKeyRef:
+              name: grafana-telegram
+              key: TELEGRAM_BOT_TOKEN
+        - name: TELEGRAM_CHAT_ID  # referenced as ${TELEGRAM_CHAT_ID} by the contact point provisioning
+          valueFrom:
+            secretKeyRef:
+              name: grafana-telegram
+              key: TELEGRAM_CHAT_ID
         image: docker.io/grafana/grafana:12.4.2
         imagePullPolicy: IfNotPresent
         livenessProbe:
@@ -213,6 +223,8 @@ spec:
           subPath: provider.yaml
         - mountPath: /etc/grafana/provisioning/datasources
           name: sc-datasources-volume
+        - mountPath: /etc/grafana/provisioning/alerting  # alerting subdirectory of the provisioning path set above
+          name: grafana-alerting
       dnsPolicy: ClusterFirst
       enableServiceLinks: true
       initContainers:
@@ -270,4 +282,8 @@ spec:
         name: sc-dashboard-provider
       - emptyDir: {}
         name: sc-datasources-volume
+      - configMap:
+          defaultMode: 420  # decimal form of octal 0644
+          name: grafana-alerting  # ConfigMap carrying the alerting provisioning files
+        name: grafana-alerting
 
diff --git a/monitoring/secret-grafana-telegram.yaml b/monitoring/secret-grafana-telegram.yaml
new file mode 100644
index 0000000..89fb585
--- /dev/null
+++ b/monitoring/secret-grafana-telegram.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: grafana-telegram  # read through secretKeyRef by the Grafana Deployment
+  namespace: monitoring
+type: Opaque
+stringData:  # placeholders only: base64 is not encryption, so real credentials must never be committed
+  TELEGRAM_BOT_TOKEN: "REPLACE_WITH_BOT_TOKEN"  # supply out of band; revoke and rotate any token that was committed
+  TELEGRAM_CHAT_ID: "REPLACE_WITH_CHAT_ID"  # quoted so the numeric ID stays a string