From 94c059ccb90efac58ddff2bac0b11210ee0a68ee Mon Sep 17 00:00:00 2001
From: chemavx
Date: Sun, 26 Apr 2026 15:25:07 +0000
Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20Grafana=20alerting=20?=
 =?UTF-8?q?=E2=86=92=20Telegram=20for=20homelab?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Secret grafana-telegram: bot token + chat ID (env var injection).
  The manifest carries placeholders only; real values are supplied
  out of band and never committed.
- ConfigMap grafana-alerting: provisioning files for contact point,
  notification policy, and 4 alert rules
  * Pod CrashLoopBackOff (for: 1m, noData: OK)
  * Disk > 80% on non-tmpfs filesystems (for: 5m)
  * RAM > 85% (for: 5m)
  * Pod Failed/Unknown (for: 3m, noData: OK)
- Deployment: TELEGRAM_* env vars from secret + alerting volume mount

Token interpolated via ${TELEGRAM_BOT_TOKEN} in provisioning YAML.
---
Review notes (this section is ignored by git am):
- Line layout and indentation were reconstructed from a
  whitespace-collapsed copy of the patch; re-check the context lines of
  the Deployment hunks against the tree before applying.
- The blob ids on the "index" lines of the two new files predate the
  review edits.
- The bot token embedded in the earlier revision of the Secret must be
  treated as leaked: revoke it and issue a new one.

 monitoring/configmap-grafana-alerting.yaml    | 186 ++++++++++++++++++
 ...loyment-kube-prometheus-stack-grafana.yaml |  16 ++
 monitoring/secret-grafana-telegram.yaml       |  11 ++
 3 files changed, 213 insertions(+)
 create mode 100644 monitoring/configmap-grafana-alerting.yaml
 create mode 100644 monitoring/secret-grafana-telegram.yaml

diff --git a/monitoring/configmap-grafana-alerting.yaml b/monitoring/configmap-grafana-alerting.yaml
new file mode 100644
index 0000000..b80c6f9
--- /dev/null
+++ b/monitoring/configmap-grafana-alerting.yaml
@@ -0,0 +1,186 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-alerting
+  namespace: monitoring
+data:
+  # ${TELEGRAM_*} is interpolated by Grafana from the container environment,
+  # which the Deployment fills from the grafana-telegram Secret.
+  contact-points.yaml: |
+    apiVersion: 1
+    contactPoints:
+      - orgId: 1
+        name: Telegram Homelab
+        receivers:
+          - uid: telegram-homelab
+            type: telegram
+            settings:
+              bottoken: "${TELEGRAM_BOT_TOKEN}"
+              chatid: "${TELEGRAM_CHAT_ID}"
+              parse_mode: HTML
+            disableResolveMessage: false
+
+  notification-policy.yaml: |
+    apiVersion: 1
+    policies:
+      - orgId: 1
+        receiver: Telegram Homelab
+        group_by:
+          - alertname
+          - namespace
+          - pod
+        group_wait: 30s
+        group_interval: 5m
+        repeat_interval: 4h
+        routes: []
+
+  # Every rule pairs an instant Prometheus query (refId A) with a threshold
+  # expression (refId B); `expression: A` tells B which query result to test.
+  alert-rules.yaml: |
+    apiVersion: 1
+    groups:
+      - orgId: 1
+        name: homelab-infra
+        folder: Homelab Alerts
+        interval: 1m
+        rules:
+
+          - uid: homelab-crashloop
+            title: "Pod CrashLoopBackOff"
+            condition: B
+            for: 1m
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
+              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
+            labels:
+              severity: critical
+            isPaused: false
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
+                  instant: true
+                  refId: A
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
+
+          - uid: homelab-disk-high
+            title: "Disco > 80%"
+            condition: B
+            for: 5m
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
+              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: '(node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"} - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}'
+                  instant: true
+                  refId: A
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0.8], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
+
+          - uid: homelab-ram-high
+            title: "RAM > 85%"
+            condition: B
+            for: 5m
+            noDataState: NoData
+            execErrState: Error
+            annotations:
+              summary: "RAM alta: {{ $labels.instance }}"
+              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
+                  instant: true
+                  refId: A
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0.85], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
+
+          - uid: homelab-pod-failed
+            title: "Pod Failed/Unknown"
+            condition: B
+            for: 3m
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
+              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
+            labels:
+              severity: warning
+            isPaused: false
+            data:
+              - refId: A
+                relativeTimeRange: {from: 300, to: 0}
+                datasourceUid: prometheus
+                model:
+                  editorMode: code
+                  expr: 'kube_pod_status_phase{phase=~"Failed|Unknown"}'
+                  instant: true
+                  refId: A
+              - refId: B
+                relativeTimeRange: {from: 0, to: 0}
+                datasourceUid: "-100"
+                model:
+                  type: threshold
+                  refId: B
+                  expression: A
+                  conditions:
+                    - type: query
+                      evaluator: {params: [0], type: gt}
+                      operator: {type: and}
+                      query: {params: [A]}
+                      reducer: {params: [], type: last}
diff --git a/monitoring/deployment-kube-prometheus-stack-grafana.yaml b/monitoring/deployment-kube-prometheus-stack-grafana.yaml
index 19ef1d2..a023d46 100644
--- a/monitoring/deployment-kube-prometheus-stack-grafana.yaml
+++ b/monitoring/deployment-kube-prometheus-stack-grafana.yaml
@@ -152,6 +152,16 @@ spec:
           value: /etc/grafana/provisioning
         - name: GF_UNIFIED_STORAGE_INDEX_PATH
           value: /var/lib/grafana-search/bleve
+        - name: TELEGRAM_BOT_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: grafana-telegram
+              key: TELEGRAM_BOT_TOKEN
+        - name: TELEGRAM_CHAT_ID
+          valueFrom:
+            secretKeyRef:
+              name: grafana-telegram
+              key: TELEGRAM_CHAT_ID
         image: docker.io/grafana/grafana:12.4.2
         imagePullPolicy: IfNotPresent
         livenessProbe:
@@ -213,6 +223,8 @@ spec:
           subPath: provider.yaml
         - mountPath: /etc/grafana/provisioning/datasources
           name: sc-datasources-volume
+        - mountPath: /etc/grafana/provisioning/alerting
+          name: grafana-alerting
       dnsPolicy: ClusterFirst
       enableServiceLinks: true
       initContainers:
@@ -270,4 +282,8 @@ spec:
         name: sc-dashboard-provider
       - emptyDir: {}
         name: sc-datasources-volume
+      - configMap:
+          defaultMode: 420
+          name: grafana-alerting
+        name: grafana-alerting

diff --git a/monitoring/secret-grafana-telegram.yaml b/monitoring/secret-grafana-telegram.yaml
new file mode 100644
index 0000000..89fb585
--- /dev/null
+++ b/monitoring/secret-grafana-telegram.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: grafana-telegram
+  namespace: monitoring
+type: Opaque
+# Placeholders only: never commit the real bot token or chat ID. Provide them
+# out of band (kubectl create secret, Sealed Secrets, SOPS, ...).
+stringData:
+  TELEGRAM_BOT_TOKEN: "REPLACE_WITH_BOT_TOKEN"
+  TELEGRAM_CHAT_ID: "REPLACE_WITH_CHAT_ID"