# k8s-manifests/monitoring/configmap-grafana-alerting.yaml

# ConfigMap bundling Grafana alerting provisioning files: contact points,
# the notification policy, and the alert rules.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: monitoring
# Each key below is one provisioning file; its value is that file's YAML text.
# NOTE(review): presumably mounted under Grafana's provisioning/alerting
# directory — confirm against the Grafana Deployment.
data:
  # Contact points: a single Telegram receiver used for all homelab alerts.
  contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: Telegram Homelab
        receivers:
          - uid: telegram-homelab
            type: telegram
            settings:
              # Placeholder, not a literal token. NOTE(review): expected to be
              # filled in from the TELEGRAM_BOT_TOKEN environment variable of
              # the Grafana container — confirm it is set there.
              bottoken: "${TELEGRAM_BOT_TOKEN}"
              # Quoted so the all-digit chat id stays a string.
              chatid: "5138407666"
              # The Telegram integration's setting is named `parse_mode`;
              # the camelCase spelling is not a recognized key.
              parse_mode: HTML
              # One line per alert: the rule's `summary` annotation while the
              # alert is firing, otherwise a "Resuelto" line with its name.
              message: "{{ range .Alerts }}{{ if eq .Status \"firing\" }}{{ .Annotations.summary }}\n{{ else }}✅ Resuelto: {{ .Labels.alertname }}\n{{ end }}{{ end }}"
            # Resolved notifications stay enabled; the template above has a
            # dedicated branch for them.
            disableResolveMessage: false

  # Notification policy: everything is routed to the "Telegram Homelab"
  # contact point; no child routes are defined.
  notification-policy.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: "Telegram Homelab"
        routes: []
        # Alerts that share these label values are grouped together.
        group_by: [alertname, namespace, pod]
        # Notification timing for each group.
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h

  # Alert rules: one rule group, "homelab-infra", stored in the
  # "Homelab Alerts" folder with a 1m group interval. The rules listed under
  # it cover container crash loops, disk usage, RAM usage and pod phase.
  alert-rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: homelab-infra
        folder: Homelab Alerts
        interval: 1m
        rules:

          # Critical alert for containers reported as waiting with reason
          # CrashLoopBackOff. Pipeline: A = Prometheus instant query,
          # B = last-value reduce of A, C = threshold on B (> 0).
          # The condition (C) must hold for 1m before the rule fires.
          - uid: homelab-crashloop
            title: 'Pod CrashLoopBackOff'
            labels:
              severity: critical
            annotations:
              summary: |-
                🔄 CrashLoop: {{ $labels.pod }}
                Namespace: {{ $labels.namespace }}
              description: 'Container {{ $labels.container }} ha entrado en CrashLoopBackOff.'
            condition: C
            for: 1m
            # An empty query result maps to the OK state rather than NoData.
            noDataState: OK
            execErrState: Error
            isPaused: false
            data:
              - refId: A
                datasourceUid: prometheus
                relativeTimeRange:
                  from: 300
                  to: 0
                model:
                  refId: A
                  editorMode: code
                  instant: true
                  expr: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}'
              # NOTE(review): "-100" presumably addresses Grafana's built-in
              # expression engine — confirm for the deployed Grafana version.
              - refId: B
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: B
                  type: reduce
                  reducer: last
                  expression: A
                  settings:
                    mode: ''
              - refId: C
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: C
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0]

          # Warning alert when a filesystem's used fraction stays above 0.8
          # for 5m. Pipeline: A = used/size ratio per filesystem (tmpfs,
          # overlay, squashfs and devtmpfs excluded), B = last-value reduce
          # of A, C = threshold on B (> 0.8).
          - uid: homelab-disk-high
            title: 'Disco > 80%'
            labels:
              severity: warning
            annotations:
              summary: |-
                💾 Disco casi lleno: {{ $labels.mountpoint }}
                Uso: {{ humanizePercentage $values.B.Value }}
              description: 'Filesystem {{ $labels.mountpoint }} supera el 80% de uso.'
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: Error
            isPaused: false
            data:
              - refId: A
                datasourceUid: prometheus
                relativeTimeRange:
                  from: 300
                  to: 0
                model:
                  refId: A
                  editorMode: code
                  instant: true
                  # Folded scalar: the three lines are joined with single
                  # spaces into one PromQL expression.
                  expr: >-
                    (node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                    - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"})
                    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
              - refId: B
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: B
                  type: reduce
                  reducer: last
                  expression: A
                  settings:
                    mode: ''
              - refId: C
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: C
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0.8]

          # Warning alert when the used-memory fraction stays above 0.85 for
          # 5m. Pipeline: A = (MemTotal - MemAvailable) / MemTotal,
          # B = last-value reduce of A, C = threshold on B (> 0.85).
          - uid: homelab-ram-high
            title: 'RAM > 85%'
            labels:
              severity: warning
            annotations:
              summary: |-
                🧠 RAM alta: {{ $labels.instance }}
                Uso: {{ humanizePercentage $values.B.Value }}
              description: 'Uso de RAM supera el 85% en {{ $labels.instance }}.'
            condition: C
            for: 5m
            noDataState: NoData
            execErrState: Error
            isPaused: false
            data:
              - refId: A
                datasourceUid: prometheus
                relativeTimeRange:
                  from: 300
                  to: 0
                model:
                  refId: A
                  editorMode: code
                  instant: true
                  expr: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes'
              - refId: B
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: B
                  type: reduce
                  reducer: last
                  expression: A
                  settings:
                    mode: ''
              - refId: C
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: C
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0.85]

          # Warning alert for pods whose phase is anything other than Running
          # or Succeeded for 3m. Pipeline: A = Prometheus instant query,
          # B = last-value reduce of A, C = threshold on B (> 0).
          # NOTE(review): the selector excludes only Running|Succeeded, so
          # any other phase value (e.g. Pending) also matches, although the
          # title mentions just Failed/Unknown — confirm this is intended.
          - uid: homelab-pod-failed
            title: 'Pod Failed/Unknown'
            labels:
              severity: warning
            annotations:
              summary: |-
                🚨 Pod caído: {{ $labels.pod }}
                Namespace: {{ $labels.namespace }}
                Estado: {{ $labels.phase }}
              description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} lleva más de 3 minutos en estado {{ $labels.phase }}.'
            condition: C
            for: 3m
            # An empty query result maps to the OK state rather than NoData.
            noDataState: OK
            execErrState: Error
            isPaused: false
            data:
              - refId: A
                datasourceUid: prometheus
                relativeTimeRange:
                  from: 300
                  to: 0
                model:
                  refId: A
                  editorMode: code
                  instant: true
                  expr: 'kube_pod_status_phase{phase!~"Running|Succeeded"} == 1'
              - refId: B
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: B
                  type: reduce
                  reducer: last
                  expression: A
                  settings:
                    mode: ''
              - refId: C
                datasourceUid: '-100'
                relativeTimeRange:
                  from: 0
                  to: 0
                model:
                  refId: C
                  type: threshold
                  expression: B
                  conditions:
                    - evaluator:
                        type: gt
                        params: [0]