---
# Grafana unified-alerting provisioning bundle for the homelab cluster.
#
# Each key under `data` is a separate Grafana provisioning file:
#   - contact-points.yaml      where notifications are delivered (Telegram)
#   - notification-policy.yaml how alerts are grouped and routed
#   - alert-rules.yaml         the alert rule definitions
#
# NOTE(review): this ConfigMap is presumably mounted into Grafana's
# provisioning/alerting directory - confirm against the Grafana Deployment.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  # Single Telegram contact point. The ${...} placeholders are expected to be
  # substituted by Grafana's provisioning loader from environment variables;
  # verify TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID are set on the Grafana pod.
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              chatid: "${TELEGRAM_CHAT_ID}"
              # The Telegram receiver reads `parse_mode` (snake_case);
              # a `parseMode` key is not recognised and would be ignored.
              parse_mode: HTML
            # Receiver-level option (sibling of `settings`): keep sending
            # "resolved" notifications.
            disableResolveMessage: false

  # Root notification policy: everything goes to the Telegram contact point,
  # grouped per alert name / namespace / pod. No child routes.
  notification-policy.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  # Alert rules. Every rule follows the same two-step shape:
  #   A: instant Prometheus query (one sample per series)
  #   B: threshold expression over A; B is the rule's alert condition
  # The threshold node must name its input via `expression: A`; without it
  # Grafana cannot build the expression and the rule ends up in execErrState.
  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:
          # Fires when a container has been waiting in CrashLoopBackOff for 1m.
          - uid: homelab-crashloop
            title: "Pod CrashLoopBackOff"
            condition: B
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0]
                        type: gt
                      operator:
                        type: and
                      query:
                        params: [A]
                      reducer:
                        params: []
                        type: last

          # Fires when used/size of a non-virtual filesystem stays above 0.8 for 5m.
          - uid: homelab-disk-high
            title: "Disco > 80%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Folded scalar: the line breaks below become single spaces,
                  # yielding one single-line PromQL expression.
                  expr: >-
                    (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"} -
                    node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}) /
                    node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0.8]
                        type: gt
                      operator:
                        type: and
                      query:
                        params: [A]
                      reducer:
                        params: []
                        type: last

          # Fires when (MemTotal - MemAvailable) / MemTotal stays above 0.85 for 5m.
          - uid: homelab-ram-high
            title: "RAM > 85%"
            condition: B
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "RAM alta: {{ $labels.instance }}"
              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: >-
                    (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) /
                    node_memory_MemTotal_bytes
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0.85]
                        type: gt
                      operator:
                        type: and
                      query:
                        params: [A]
                      reducer:
                        params: []
                        type: last

          # Fires when a pod stays in a phase other than Running/Succeeded for 3m.
          # NOTE(review): the matcher below excludes only Running and Succeeded,
          # so it presumably also selects phase="Pending", not just
          # Failed/Unknown as the title suggests - confirm this is intended.
          - uid: homelab-pod-failed
            title: "Pod Failed/Unknown"
            condition: B
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: B
                  expression: A
                  conditions:
                    - type: query
                      evaluator:
                        params: [0]
                        type: gt
                      operator:
                        type: and
                      query:
                        params: [A]
                      reducer:
                        params: []
                        type: last