Files
k8s-manifests/monitoring/configmap-grafana-alerting.yaml
T
chemavx bb64cc9e62 fix(monitoring): hardcode chatid as string in Telegram contact point
Grafana's env-var substitution of a numeric TELEGRAM_CHAT_ID caused a
JSON unmarshal error (number into a string field). chatid is not sensitive,
so it is hardcoded directly; only bottoken uses ${TELEGRAM_BOT_TOKEN}.
2026-04-26 15:40:21 +00:00

179 lines
6.1 KiB
YAML

# Grafana alerting provisioning, packaged as a ConfigMap.
# Each key under `data` is one provisioning file:
#   contact-points.yaml      - where notifications are sent (Telegram)
#   notification-policy.yaml - how alerts are grouped and routed
#   alert-rules.yaml         - the alert rule group itself
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              # Only the bot token is injected from the environment.
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              # Hardcoded and quoted so the value stays a string; a numeric
              # value substituted from an env var failed to unmarshal.
              chatid: "5138407666"
              # The Telegram integration reads `parse_mode` (snake_case).
              parse_mode: HTML
            disableResolveMessage: false

  notification-policy.yaml: |
    apiVersion: 1
    policies:
      # Single root policy: everything goes to the Telegram contact point.
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:
          # Every rule below has the same two-step shape:
          #   A - instant Prometheus query (datasource uid "prometheus";
          #       assumes a datasource with that uid exists - confirm).
          #   B - threshold expression over A; B is the alert condition.
          # NOTE(review): "-100" looks like Grafana's legacy UID for the
          # built-in expression datasource (newer exports use "__expr__");
          # left unchanged - confirm against the running Grafana version.

          # Container stuck in CrashLoopBackOff (series value > 0 for 1m).
          - uid: homelab-crashloop
            title: "Pod CrashLoopBackOff"
            condition: B
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold: the refId whose result is compared.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          # Filesystem usage ratio above 0.8 for 5m (pseudo filesystems
          # tmpfs/overlay/squashfs/devtmpfs are excluded by the query).
          - uid: homelab-disk-high
            title: "Disco > 80%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # (size - avail) / size; folded into one line at load time.
                  expr: >-
                    (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                    - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"})
                    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold: the refId whose result is compared.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0.8], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          # Memory usage ratio above 0.85 for 5m.
          - uid: homelab-ram-high
            title: "RAM > 85%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "RAM alta: {{ $labels.instance }}"
              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # (total - available) / total
                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold: the refId whose result is compared.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0.85], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}

          # Pod in any phase other than Running or Succeeded for 3m.
          # NOTE(review): the regex also matches Pending, although the title
          # only mentions Failed/Unknown - confirm this is intended.
          - uid: homelab-pod-failed
            title: "Pod Failed/Unknown"
            condition: B
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange: {from: 300, to: 0}
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange: {from: 0, to: 0}
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  # Input of the threshold: the refId whose result is compared.
                  expression: A
                  conditions:
                    - type: query
                      evaluator: {params: [0], type: gt}
                      operator: {type: and}
                      query: {params: [A]}
                      reducer: {params: [], type: last}