---
# Grafana alerting provisioning for the homelab, packaged as a ConfigMap.
# Each key under `data` holds one provisioning file: contact points, the
# notification policy, and the alert rules.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
data:
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              # Placeholder only: the real bot token is expected to come from
              # Grafana's environment when the file is provisioned.
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              # Quoted so the numeric chat id stays a string.
              chatid: "5138407666"
              # The Telegram integration reads `parse_mode` (snake_case);
              # a camelCase spelling of this key is not recognised.
              parse_mode: HTML
            # Receiver-level flag: false means "resolved" messages are sent.
            disableResolveMessage: false

  notification-policy.yaml: |
    apiVersion: 1
    policies:
      # Single root policy: everything goes to the Telegram contact point.
      - orgId: 1
        receiver: Telegram Homelab
        group_by:
          - alertname
          - namespace
          - pod
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes: []

  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        # Every rule below uses the same three-step pipeline:
        #   A = instant Prometheus query over the last 300 seconds
        #   B = reduce A to its last value
        #   C = threshold on B (this is the rule condition)
        # Steps B and C use datasourceUid "-100"; presumably Grafana's
        # built-in expression datasource - confirm for your Grafana version.
        rules:
          # Fires when a container sits in CrashLoopBackOff for 1 minute.
          # An empty query result is treated as healthy (noDataState: OK).
          - uid: homelab-crashloop
            title: "Pod CrashLoopBackOff"
            condition: C
            for: 1m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "CrashLoopBackOff: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Container {{ $labels.container }} ha entrado en CrashLoopBackOff."
            labels:
              severity: critical
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        params: [0]
                        type: gt

          # Fires when the used fraction of a filesystem stays above 0.8 for
          # 5 minutes. tmpfs, overlay, squashfs and devtmpfs are excluded.
          - uid: homelab-disk-high
            title: "Disco > 80%"
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Disco lleno: {{ $labels.mountpoint }} en {{ $labels.instance }}"
              description: "Filesystem {{ $labels.mountpoint }} supera el 80% de uso."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  # Folded scalar: the line breaks below become single
                  # spaces, yielding one-line PromQL: (size - avail) / size.
                  expr: >-
                    (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                    - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"})
                    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        params: [0.8]
                        type: gt

          # Fires when the used-memory fraction (total minus available, over
          # total) stays above 0.85 for 5 minutes.
          - uid: homelab-ram-high
            title: "RAM > 85%"
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "RAM alta: {{ $labels.instance }}"
              description: "Uso de RAM supera el 85% en {{ $labels.instance }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        params: [0.85]
                        type: gt

          # Fires when a pod reports a phase other than Running or Succeeded
          # for 3 minutes.
          # NOTE(review): the selector also matches phase="Pending", which
          # the title does not mention - confirm whether Pending pods should
          # raise this alert or the matcher should be narrowed.
          - uid: homelab-pod-failed
            title: "Pod Failed/Unknown"
            condition: C
            for: 3m
            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Pod en estado {{ $labels.phase }}: {{ $labels.pod }} ({{ $labels.namespace }})"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}."
            labels:
              severity: warning
            isPaused: false
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: prometheus
                model:
                  editorMode: code
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"}'
                  instant: true
                  refId: A
              - refId: B
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: reduce
                  refId: B
                  expression: A
                  reducer: last
                  settings:
                    mode: ""
              - refId: C
                relativeTimeRange:
                  from: 0
                  to: 0
                datasourceUid: "-100"
                model:
                  type: threshold
                  refId: C
                  expression: B
                  conditions:
                    - evaluator:
                        params: [0]
                        type: gt