feat(monitoring): Grafana alerting → Telegram for homelab
- Secret grafana-telegram: bot token + chat ID (env var injection)
- ConfigMap grafana-alerting: provisioning files for contact point,
notification policy, and 4 alert rules
* Pod CrashLoopBackOff (for: 1m, noData: OK)
* Disk > 80% on real filesystems, excluding tmpfs/overlay/squashfs/devtmpfs (for: 5m)
* RAM > 85% (for: 5m)
* Pod Failed/Unknown (for: 3m, noData: OK)
- Deployment: TELEGRAM_* env vars from secret + alerting volume mount
Token interpolated via ${TELEGRAM_BOT_TOKEN} in provisioning YAML.
This commit is contained in:
@@ -0,0 +1,178 @@
|
|||||||
|
# Grafana alerting provisioning for the homelab.
#
# Each key under `data` is one provisioning file; the Grafana Deployment
# mounts this ConfigMap at /etc/grafana/provisioning/alerting.
# ${TELEGRAM_BOT_TOKEN} and ${TELEGRAM_CHAT_ID} are resolved by Grafana from
# the container environment, which the Deployment fills from the
# `grafana-telegram` Secret.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  # Contact point: a single Telegram receiver.
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              chatid: "${TELEGRAM_CHAT_ID}"
              # The Telegram integration reads `parse_mode`; the camelCase
              # spelling `parseMode` is not a recognised key.
              parse_mode: HTML
            disableResolveMessage: false

  # Root notification policy: everything goes to the Telegram contact point.
  notification-policy.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  # Alert rules. Every rule has the same two-step shape:
  #   A = instant Prometheus query
  #   B = threshold expression over A (`expression: A`), used as the condition
  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:

          - uid: homelab-crashloop
            title: "Pod CrashLoopBackOff"
            condition: B
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                # `__expr__` is the UID of Grafana's server-side expression engine.
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  # A threshold expression must name the refId it evaluates.
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0]
                        type: gt
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          - uid: homelab-disk-high
            title: "Disco > 80%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Used fraction (0..1) per filesystem, skipping pseudo filesystems.
                  expr: '(node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"} - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0.8]
                        type: gt
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          - uid: homelab-ram-high
            title: "RAM > 85%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "RAM alta: {{ $labels.instance }}"
              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Used fraction (0..1) of memory per instance.
                  expr: "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes"
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0.85]
                        type: gt
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          - uid: homelab-pod-failed
            title: "Pod Failed/Unknown"
            condition: B
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # NOTE(review): this selector keeps every phase except
                  # Running/Succeeded, so it presumably also fires for
                  # Pending pods although the title says Failed/Unknown.
                  # Confirm that is intended.
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: __expr__
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0]
                        type: gt
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}
||||||
@@ -152,6 +152,16 @@ spec:
|
|||||||
value: /etc/grafana/provisioning
|
value: /etc/grafana/provisioning
|
||||||
- name: GF_UNIFIED_STORAGE_INDEX_PATH
|
- name: GF_UNIFIED_STORAGE_INDEX_PATH
|
||||||
value: /var/lib/grafana-search/bleve
|
value: /var/lib/grafana-search/bleve
|
||||||
|
- name: TELEGRAM_BOT_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: grafana-telegram
|
||||||
|
key: TELEGRAM_BOT_TOKEN
|
||||||
|
- name: TELEGRAM_CHAT_ID
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: grafana-telegram
|
||||||
|
key: TELEGRAM_CHAT_ID
|
||||||
image: docker.io/grafana/grafana:12.4.2
|
image: docker.io/grafana/grafana:12.4.2
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
@@ -213,6 +223,8 @@ spec:
|
|||||||
subPath: provider.yaml
|
subPath: provider.yaml
|
||||||
- mountPath: /etc/grafana/provisioning/datasources
|
- mountPath: /etc/grafana/provisioning/datasources
|
||||||
name: sc-datasources-volume
|
name: sc-datasources-volume
|
||||||
|
- mountPath: /etc/grafana/provisioning/alerting
|
||||||
|
name: grafana-alerting
|
||||||
dnsPolicy: ClusterFirst
|
dnsPolicy: ClusterFirst
|
||||||
enableServiceLinks: true
|
enableServiceLinks: true
|
||||||
initContainers:
|
initContainers:
|
||||||
@@ -270,4 +282,8 @@ spec:
|
|||||||
name: sc-dashboard-provider
|
name: sc-dashboard-provider
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: sc-datasources-volume
|
name: sc-datasources-volume
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: grafana-alerting
|
||||||
|
name: grafana-alerting
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,9 @@
|
|||||||
|
# Telegram credentials for Grafana alerting. The Grafana Deployment reads
# both keys through secretKeyRef and exposes them as environment variables.
#
# SECURITY: real credentials must not be committed. Base64 in a Secret
# manifest is an encoding, not encryption. The bot token that was previously
# committed here should be treated as leaked: revoke it with @BotFather and
# issue a new one. Provide the real values out of band (for example
# `kubectl create secret generic`, SOPS, or Sealed Secrets).
apiVersion: v1
kind: Secret
metadata:
  name: grafana-telegram
  namespace: monitoring
type: Opaque
# stringData takes plain text, so the placeholders need no base64 encoding.
stringData:
  TELEGRAM_BOT_TOKEN: "REPLACE_WITH_TELEGRAM_BOT_TOKEN"
  TELEGRAM_CHAT_ID: "REPLACE_WITH_TELEGRAM_CHAT_ID"
|
||||||
Reference in New Issue
Block a user