Files
k8s-manifests/monitoring/configmap-grafana-alerting.yaml
chemavx 4897ca3334 feat(grafana): custom emoji message templates per alert + resolve format
Each alert rule's summary annotation now renders a formatted Telegram
message with emoji and multiline context. The contact point passes the
pre-rendered summary through while an alert is firing and sends
"✅ Resuelto: <alertname>" on resolution.
Also restores the == 1 filter on Pod Failed/Unknown lost in prior rebase.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 07:26:01 +00:00

208 lines
7.0 KiB
YAML

---
# Kubernetes ConfigMap "grafana-alerting" in the "monitoring" namespace.
# Each key under `data` is one provisioning document (the keys visible in
# this file are contact-points.yaml, notification-policy.yaml and
# alert-rules.yaml).
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
# Contact-point provisioning document: a single Telegram receiver named
# "Telegram Homelab".
contact-points.yaml: |
  apiVersion: 1
  contactPoints:
    - orgId: 1
      name: Telegram Homelab
      receivers:
        - uid: telegram-homelab
          type: telegram
          settings:
            # Placeholder, not a literal token. Presumably expanded from the
            # environment when the file is provisioned - confirm the variable
            # is set for the Grafana container.
            bottoken: "${TELEGRAM_BOT_TOKEN}"
            # Quoted so the all-digit chat ID stays a string.
            chatid: "5138407666"
            # The Telegram integration reads `parse_mode`; the former
            # `parseMode` spelling is not a recognised setting and was ignored.
            parse_mode: HTML
            # Per alert in the notification:
            #   status "firing" -> the rule's pre-rendered summary annotation
            #   anything else   -> "✅ Resuelto: <alertname>"
            # Each entry is followed by a newline. Because parse_mode is HTML,
            # label values rendered into the summary must not contain raw
            # '<', '>' or '&'.
            message: |-
              {{ range .Alerts }}{{ if eq .Status "firing" }}{{ .Annotations.summary }}
              {{ else }}✅ Resuelto: {{ .Labels.alertname }}
              {{ end }}{{ end }}
          # Keep resolve notifications enabled; the template above formats them.
          disableResolveMessage: false
# Notification-policy provisioning document: one policy for org 1 that
# routes to the "Telegram Homelab" receiver and declares no child routes.
notification-policy.yaml: |
  apiVersion: 1
  policies:
    - orgId: 1
      receiver: Telegram Homelab
      # Labels used to group alerts into a single notification.
      group_by: [alertname, namespace, pod]
      # Grouping / repeat timers.
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      # Explicit empty list: no nested routes.
      routes: []
# Alert-rule provisioning document: rule group "homelab-infra" stored in the
# "Homelab Alerts" folder. The group's rules follow under `rules`.
alert-rules.yaml: |
  apiVersion: 1
  groups:
    - orgId: 1
      name: homelab-infra
      folder: Homelab Alerts
      # Group interval setting.
      interval: 1m
      rules:
# Rule "Pod CrashLoopBackOff" (severity: critical).
# Query pipeline: A (instant Prometheus query) -> B (reduce, last) ->
# C (threshold, B > 0). C is the alert condition, with `for: 1m`.
- uid: homelab-crashloop
  title: 'Pod CrashLoopBackOff'
  condition: C
  for: 1m
  # noDataState OK: an empty query result is not treated as a problem.
  noDataState: OK
  execErrState: Error
  annotations:
    # Two-line text; the Telegram contact point forwards this annotation
    # verbatim while the alert is firing.
    summary: |-
      🔄 CrashLoop: {{ $labels.pod }}
      Namespace: {{ $labels.namespace }}
    description: 'Container {{ $labels.container }} ha entrado en CrashLoopBackOff.'
  labels:
    severity: critical
  isPaused: false
  data:
    # A: series for containers whose waiting reason is CrashLoopBackOff.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        editorMode: code
        expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
        instant: true
        refId: A
    # B: reduce each series from A to its last value.
    # NOTE(review): "-100" is presumably Grafana's built-in expression
    # datasource - confirm for the deployed Grafana version.
    - refId: B
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: reduce
        refId: B
        expression: A
        reducer: last
        settings:
          mode: ''
    # C: condition is true when B is greater than 0.
    - refId: C
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: threshold
        refId: C
        expression: B
        conditions:
          - evaluator:
              params: [0]
              type: gt
# Rule "Disco > 80%" (severity: warning).
# Query pipeline: A (instant Prometheus query, used fraction per filesystem)
# -> B (reduce, last) -> C (threshold, B > 0.8). C is the alert condition,
# with `for: 5m`.
- uid: homelab-disk-high
  title: 'Disco > 80%'
  condition: C
  for: 5m
  noDataState: NoData
  execErrState: Error
  annotations:
    # Two-line text; the second line renders B's value through
    # humanizePercentage.
    summary: |-
      💾 Disco casi lleno: {{ $labels.mountpoint }}
      Uso: {{ humanizePercentage $values.B.Value }}
    description: 'Filesystem {{ $labels.mountpoint }} supera el 80% de uso.'
  labels:
    severity: warning
  isPaused: false
  data:
    # A: (size - avail) / size, skipping tmpfs, overlay, squashfs and
    # devtmpfs filesystems. The folded scalar joins the lines with single
    # spaces, yielding one PromQL expression.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        editorMode: code
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
          - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
        instant: true
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: reduce
        refId: B
        expression: A
        reducer: last
        settings:
          mode: ''
    # C: condition is true when B is greater than 0.8 (80% used).
    - refId: C
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: threshold
        refId: C
        expression: B
        conditions:
          - evaluator:
              params: [0.8]
              type: gt
# Rule "RAM > 85%" (severity: warning).
# Query pipeline: A (instant Prometheus query, used-memory fraction) ->
# B (reduce, last) -> C (threshold, B > 0.85). C is the alert condition,
# with `for: 5m`.
- uid: homelab-ram-high
  title: 'RAM > 85%'
  condition: C
  for: 5m
  noDataState: NoData
  execErrState: Error
  annotations:
    # Two-line text; the second line renders B's value through
    # humanizePercentage.
    summary: |-
      🧠 RAM alta: {{ $labels.instance }}
      Uso: {{ humanizePercentage $values.B.Value }}
    description: 'Uso de RAM supera el 85% en {{ $labels.instance }}.'
  labels:
    severity: warning
  isPaused: false
  data:
    # A: (MemTotal - MemAvailable) / MemTotal.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        editorMode: code
        expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
        instant: true
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: reduce
        refId: B
        expression: A
        reducer: last
        settings:
          mode: ''
    # C: condition is true when B is greater than 0.85 (85% used).
    - refId: C
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: threshold
        refId: C
        expression: B
        conditions:
          - evaluator:
              params: [0.85]
              type: gt
# Rule "Pod Failed/Unknown" (severity: warning).
# Query pipeline: A (instant Prometheus query) -> B (reduce, last) ->
# C (threshold, B > 0). C is the alert condition, with `for: 3m`.
- uid: homelab-pod-failed
  title: 'Pod Failed/Unknown'
  condition: C
  for: 3m
  # noDataState OK: an empty query result is not treated as a problem.
  noDataState: OK
  execErrState: Error
  annotations:
    # Three-line text: pod, namespace and the phase label of the series.
    summary: |-
      🚨 Pod caído: {{ $labels.pod }}
      Namespace: {{ $labels.namespace }}
      Estado: {{ $labels.phase }}
    description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}.'
  labels:
    severity: warning
  isPaused: false
  data:
    # A: phase series whose value equals 1, for every phase other than
    # Running and Succeeded.
    # NOTE(review): the selector excludes only Running|Succeeded, so any other
    # phase value (e.g. Pending) matches too, although the title says
    # Failed/Unknown - confirm that is intended.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        editorMode: code
        expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"} == 1'
        instant: true
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: reduce
        refId: B
        expression: A
        reducer: last
        settings:
          mode: ''
    # C: condition is true when B is greater than 0.
    - refId: C
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: '-100'
      model:
        type: threshold
        refId: C
        expression: B
        conditions:
          - evaluator:
              params: [0]
              type: gt