apiVersion: v1 data: monitoring-kube-prometheus-stack-config-reloaders-eae692b3-e0b3-459e-8981-8dc6d7da6055.yaml: "groups:\n\ - name: config-reloaders\n rules:\n - alert: ConfigReloaderSidecarErrors\n \ \ annotations:\n description: |-\n Errors encountered while the\ \ {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}}\ \ namespace.\n As a result, configuration for service running in {{$labels.pod}}\ \ may be stale and cannot be updated anymore.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors\n\ \ summary: config-reloader sidecar has not had a successful reload for 10m\n\ \ expr: max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m])\ \ == 0\n for: 10m\n labels:\n severity: warning\n" monitoring-kube-prometheus-stack-etcd-de0d66c3-becc-4bd9-8ab6-dff75f452f02.yaml: "groups:\n\ - name: etcd\n rules:\n - alert: etcdMembersDown\n annotations:\n description:\ \ 'etcd cluster \"{{ $labels.job }}\": members are down ({{ $value\n }}).'\n\ \ summary: etcd cluster members are down.\n expr: |-\n max without\ \ (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} ==\ \ bool 0)\n or\n count without (To) (\n sum without (instance,\ \ pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s]))\ \ > 0.01\n )\n )\n > 0\n for: 20m\n labels:\n severity:\ \ warning\n - alert: etcdInsufficientMembers\n annotations:\n description:\ \ 'etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value\n \ \ }}).'\n summary: etcd cluster has insufficient number of members.\n \ \ expr: sum(up{job=~\".*etcd.*\"} == bool 1) without (instance, pod) < ((count(up{job=~\"\ .*etcd.*\"})\n without (instance, pod) + 1) / 2)\n for: 3m\n labels:\n\ \ severity: critical\n - alert: etcdNoLeader\n annotations:\n description:\ \ 'etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }}\n \ \ has no leader.'\n summary: etcd cluster has no leader.\n 
expr: etcd_server_has_leader{job=~\"\ .*etcd.*\"} == 0\n for: 1m\n labels:\n severity: critical\n - alert:\ \ etcdHighNumberOfLeaderChanges\n annotations:\n description: 'etcd cluster\ \ \"{{ $labels.job }}\": {{ $value }} leader changes\n within the last\ \ 15 minutes. Frequent elections may be a sign of insufficient\n resources,\ \ high network latency, or disruptions by other components and should\n \ \ be investigated.'\n summary: etcd cluster has high number of leader changes.\n\ \ expr: increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~\"\ .*etcd.*\"})\n or 0*absent(etcd_server_leader_changes_seen_total{job=~\"\ .*etcd.*\"}))[15m:1m])\n >= 4\n for: 5m\n labels:\n severity:\ \ warning\n - alert: etcdHighNumberOfFailedGRPCRequests\n annotations:\n \ \ description: 'etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests\ \ for\n {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance\ \ }}.'\n summary: etcd cluster has high number of failed grpc requests.\n\ \ expr: |-\n 100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\"\ , grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ }[5m])) without (grpc_type, grpc_code)\n /\n sum(rate(grpc_server_handled_total{job=~\"\ .*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n for: 10m\n\ \ labels:\n severity: warning\n - alert: etcdHighNumberOfFailedGRPCRequests\n\ \ annotations:\n description: 'etcd cluster \"{{ $labels.job }}\": {{\ \ $value }}% of requests for\n {{ $labels.grpc_method }} failed on etcd\ \ instance {{ $labels.instance }}.'\n summary: etcd cluster has high number\ \ of failed grpc requests.\n expr: |-\n 100 * sum(rate(grpc_server_handled_total{job=~\"\ .*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ }[5m])) without (grpc_type, grpc_code)\n /\n sum(rate(grpc_server_handled_total{job=~\"\ .*etcd.*\"}[5m])) without 
(grpc_type, grpc_code)\n > 5\n for: 5m\n \ \ labels:\n severity: critical\n - alert: etcdGRPCRequestsSlow\n annotations:\n\ \ description: 'etcd cluster \"{{ $labels.job }}\": 99th percentile of gRPC\ \ requests\n is {{ $value }}s on etcd instance {{ $labels.instance }} for\ \ {{ $labels.grpc_method\n }} method.'\n summary: etcd grpc requests\ \ are slow\n expr: |-\n histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\"\ .*etcd.*\", grpc_method!=\"Defragment\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n\ \ > 0.15\n for: 10m\n labels:\n severity: critical\n - alert:\ \ etcdMemberCommunicationSlow\n annotations:\n description: 'etcd cluster\ \ \"{{ $labels.job }}\": member communication with {{\n $labels.To }} is\ \ taking {{ $value }}s on etcd instance {{ $labels.instance\n }}.'\n \ \ summary: etcd cluster member communication is slow.\n expr: |-\n \ \ histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\"\ .*etcd.*\"}[5m]))\n > 0.15\n for: 10m\n labels:\n severity: warning\n\ \ - alert: etcdHighNumberOfFailedProposals\n annotations:\n description:\ \ 'etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures\n \ \ within the last 30 minutes on etcd instance {{ $labels.instance }}.'\n \ \ summary: etcd cluster has high number of proposal failures.\n expr: rate(etcd_server_proposals_failed_total{job=~\"\ .*etcd.*\"}[15m]) > 5\n for: 15m\n labels:\n severity: warning\n -\ \ alert: etcdHighFsyncDurations\n annotations:\n description: 'etcd cluster\ \ \"{{ $labels.job }}\": 99th percentile fsync durations\n are {{ $value\ \ }}s on etcd instance {{ $labels.instance }}.'\n summary: etcd cluster 99th\ \ percentile fsync durations are too high.\n expr: |-\n histogram_quantile(0.99,\ \ rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n\ \ > 0.5\n for: 10m\n labels:\n severity: warning\n - alert: etcdHighFsyncDurations\n\ \ annotations:\n description: 'etcd cluster \"{{ $labels.job 
}}\": 99th\ \ percentile fsync durations\n are {{ $value }}s on etcd instance {{ $labels.instance\ \ }}.'\n summary: etcd cluster 99th percentile fsync durations are too high.\n\ \ expr: |-\n histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\"\ .*etcd.*\"}[5m]))\n > 1\n for: 10m\n labels:\n severity: critical\n\ \ - alert: etcdHighCommitDurations\n annotations:\n description: 'etcd\ \ cluster \"{{ $labels.job }}\": 99th percentile commit durations\n {{\ \ $value }}s on etcd instance {{ $labels.instance }}.'\n summary: etcd cluster\ \ 99th percentile commit durations are too high.\n expr: |-\n histogram_quantile(0.99,\ \ rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n\ \ > 0.25\n for: 10m\n labels:\n severity: warning\n - alert:\ \ etcdDatabaseQuotaLowSpace\n annotations:\n description: 'etcd cluster\ \ \"{{ $labels.job }}\": database size exceeds the defined\n quota on etcd\ \ instance {{ $labels.instance }}, please defrag or increase the\n quota\ \ as the writes to etcd will be disabled when it is full.'\n summary: etcd\ \ cluster database is running full.\n expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~\"\ .*etcd.*\"}[5m]) /\n last_over_time(etcd_server_quota_backend_bytes{job=~\"\ .*etcd.*\"}[5m]))*100 >\n 95\n for: 10m\n labels:\n severity:\ \ critical\n - alert: etcdExcessiveDatabaseGrowth\n annotations:\n description:\ \ 'etcd cluster \"{{ $labels.job }}\": Predicting running out of disk\n \ \ space in the next four hours, based on write observations within the past\n\ \ four hours on etcd instance {{ $labels.instance }}, please check as it\ \ might\n be disruptive.'\n summary: etcd cluster database growing\ \ very fast.\n expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~\"\ .*etcd.*\"}[4h], 4*60*60)\n > etcd_server_quota_backend_bytes{job=~\".*etcd.*\"\ }\n for: 10m\n labels:\n severity: warning\n - alert: etcdDatabaseHighFragmentationRatio\n\ \ annotations:\n description: 
'etcd cluster \"{{ $labels.job }}\": database\ \ size in use on instance\n {{ $labels.instance }} is {{ $value | humanizePercentage\ \ }} of the actual\n allocated disk space, please run defragmentation (e.g.\ \ etcdctl defrag) to\n retrieve the unused fragmented disk space.'\n \ \ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation\n\ \ summary: etcd database size in use is less than 50% of the actual allocated\n\ \ storage.\n expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~\"\ .*etcd.*\"}[5m])\n / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~\"\ .*etcd.*\"}[5m])) < 0.5\n and etcd_mvcc_db_total_size_in_use_in_bytes{job=~\"\ .*etcd.*\"} > 104857600\n for: 10m\n labels:\n severity: warning\n" monitoring-kube-prometheus-stack-general.rules-f627c7c8-ea4b-4b56-98b8-e667d6567e7b.yaml: "groups:\n\ - name: general.rules\n rules:\n - alert: TargetDown\n annotations:\n \ \ description: '{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service\n\ \ }} targets in {{ $labels.namespace }} namespace are down.'\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/general/targetdown\n \ \ summary: One or more targets are unreachable.\n expr: 100 * (count(up ==\ \ 0) BY (cluster, job, namespace, service) / count(up)\n BY (cluster, job,\ \ namespace, service)) > 10\n for: 10m\n labels:\n severity: warning\n\ \ - alert: Watchdog\n annotations:\n description: |\n This is\ \ an alert meant to ensure that the entire alerting pipeline is functional.\n\ \ This alert is always firing, therefore it should always be firing in\ \ Alertmanager\n and always fire against a receiver. There are integrations\ \ with various notification\n mechanisms that send a notification when\ \ this alert is not firing. 
For example the\n \"DeadMansSnitch\" integration\ \ in PagerDuty.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog\n\ \ summary: An alert that should always be firing to certify that Alertmanager\n\ \ is working properly.\n expr: vector(1)\n labels:\n severity:\ \ none\n - alert: InfoInhibitor\n annotations:\n description: |\n \ \ This is an alert that is used to inhibit info alerts.\n By themselves,\ \ the info-level alerts are sometimes very noisy, but they are relevant when combined\ \ with\n other alerts.\n This alert fires whenever there's a severity=\"\ info\" alert, and stops firing when another alert with a\n severity of\ \ 'warning' or 'critical' starts firing on the same namespace.\n This alert\ \ should be routed to a null receiver and configured to inhibit alerts with severity=\"\ info\".\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor\n\ \ summary: Info-level alert inhibition.\n expr: group by (namespace) (ALERTS{severity\ \ = \"info\"} == 1) unless on (namespace)\n group by (namespace) (ALERTS{alertname\ \ != \"InfoInhibitor\", alertstate = \"firing\",\n severity =~ \"warning|critical\"\ } == 1)\n labels:\n severity: none\n" monitoring-kube-prometheus-stack-k8s.rules.container-cpu-usage-seconds-tot-715fb365-db24-4478-8fdf-40df40c31616.yaml: "groups:\n\ - name: k8s.rules.container_cpu_usage_seconds_total\n rules:\n - expr: |-\n\ \ sum by (cluster, namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"\ kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}[5m])\n ) * on\ \ (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod)\ \ (\n 1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\ \"})\n )\n record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m\n\ \ - expr: |-\n sum by (cluster, namespace, pod, container) (\n irate(container_cpu_usage_seconds_total{job=\"\ kubelet\", 
metrics_path=\"/metrics/cadvisor\", image!=\"\"}[5m])\n ) * on\ \ (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod)\ \ (\n 1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\ \"})\n )\n record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate\n" monitoring-kube-prometheus-stack-k8s.rules.container-memory-cache-c548651c-95bc-4bc3-a2d8-6fb97abc9ec3.yaml: "groups:\n\ - name: k8s.rules.container_memory_cache\n rules:\n - expr: |-\n container_memory_cache{job=\"\ kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n * on (cluster,\ \ namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,\n \ \ max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n \ \ )\n record: node_namespace_pod_container:container_memory_cache\n" monitoring-kube-prometheus-stack-k8s.rules.container-memory-rss-1baf12c0-1dce-4867-86be-91c1ee948313.yaml: "groups:\n\ - name: k8s.rules.container_memory_rss\n rules:\n - expr: |-\n container_memory_rss{job=\"\ kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n * on (cluster,\ \ namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,\n \ \ max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n \ \ )\n record: node_namespace_pod_container:container_memory_rss\n" monitoring-kube-prometheus-stack-k8s.rules.container-memory-swap-dd2afe1d-3a0d-44ed-b97a-0fc8e29e111e.yaml: "groups:\n\ - name: k8s.rules.container_memory_swap\n rules:\n - expr: |-\n container_memory_swap{job=\"\ kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n * on (cluster,\ \ namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,\n \ \ max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n \ \ )\n record: node_namespace_pod_container:container_memory_swap\n" monitoring-kube-prometheus-stack-k8s.rules.container-memory-working-set-by-b44b3b4f-1d0e-466a-9b08-a078ed6f1588.yaml: "groups:\n\ - name: 
k8s.rules.container_memory_working_set_bytes\n rules:\n - expr: |-\n\ \ container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"\ , image!=\"\"}\n * on (cluster, namespace, pod) group_left(node) topk by\ \ (cluster, namespace, pod) (1,\n max by (cluster, namespace, pod, node)\ \ (kube_pod_info{node!=\"\"})\n )\n record: node_namespace_pod_container:container_memory_working_set_bytes\n" monitoring-kube-prometheus-stack-k8s.rules.container-resource-f779d497-1ce1-46cf-8234-c68eca5f1472.yaml: "groups:\n\ - name: k8s.rules.container_resource\n rules:\n - expr: |-\n kube_pod_container_resource_requests{resource=\"\ memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left()\ \ max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"\ Pending|Running\"} == 1)\n )\n record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests\n\ \ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\ \ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\ \ kube_pod_container_resource_requests{resource=\"memory\",job=\"\ kube-state-metrics\"}\n ) * on (namespace, pod, cluster) group_left()\ \ max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"\ Pending|Running\"} == 1\n )\n )\n )\n record: namespace_memory:kube_pod_container_resource_requests:sum\n\ \ - expr: |-\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"\ kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left() max\ \ by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"\ } == 1)\n )\n record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests\n\ \ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\ \ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\ \ kube_pod_container_resource_requests{resource=\"cpu\",job=\"\ kube-state-metrics\"}\n ) * on (namespace, pod, cluster) group_left()\ \ max by (namespace, pod, 
cluster) (\n kube_pod_status_phase{phase=~\"\ Pending|Running\"} == 1\n )\n )\n )\n record: namespace_cpu:kube_pod_container_resource_requests:sum\n\ \ - expr: |-\n kube_pod_container_resource_limits{resource=\"memory\",job=\"\ kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left() max\ \ by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"\ } == 1)\n )\n record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits\n\ \ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\ \ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\ \ kube_pod_container_resource_limits{resource=\"memory\",job=\"\ kube-state-metrics\"}\n ) * on (namespace, pod, cluster) group_left()\ \ max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"\ Pending|Running\"} == 1\n )\n )\n )\n record: namespace_memory:kube_pod_container_resource_limits:sum\n\ \ - expr: |-\n kube_pod_container_resource_limits{resource=\"cpu\",job=\"\ kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left() max\ \ by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"\ } == 1)\n )\n record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits\n\ \ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\ \ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\ \ kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"\ }\n ) * on (namespace, pod, cluster) group_left() max by (namespace,\ \ pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"\ } == 1\n )\n )\n )\n record: namespace_cpu:kube_pod_container_resource_limits:sum\n" monitoring-kube-prometheus-stack-k8s.rules.pod-owner-6921c522-c42e-43f6-8c4c-296d6fd5994e.yaml: "groups:\n\ - name: k8s.rules.pod_owner\n rules:\n - expr: |-\n max by (cluster, namespace,\ \ workload, pod) (\n label_replace(\n label_replace(\n \ \ kube_pod_owner{job=\"kube-state-metrics\", 
owner_kind=\"ReplicaSet\"},\n\ \ \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on\ \ (cluster, replicaset, namespace) group_left(owner_name) topk by (cluster, replicaset,\ \ namespace) (\n 1, max by (cluster, replicaset, namespace, owner_name)\ \ (\n kube_replicaset_owner{job=\"kube-state-metrics\", owner_kind=\"\ \"}\n )\n ),\n \"workload\", \"$1\", \"replicaset\"\ , \"(.*)\"\n )\n )\n labels:\n workload_type: replicaset\n\ \ record: namespace_workload_pod:kube_pod_owner:relabel\n - expr: |-\n \ \ max by (cluster, namespace, workload, pod) (\n label_replace(\n \ \ label_replace(\n kube_pod_owner{job=\"kube-state-metrics\"\ , owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\"\ , \"(.*)\"\n ) * on (replicaset, namespace, cluster) group_left(owner_name)\ \ topk by (cluster, replicaset, namespace) (\n 1, max by (cluster,\ \ replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"\ kube-state-metrics\", owner_kind=\"Deployment\"}\n )\n ),\n\ \ \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n )\n\ \ labels:\n workload_type: deployment\n record: namespace_workload_pod:kube_pod_owner:relabel\n\ \ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\ \ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"\ },\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n \ \ )\n labels:\n workload_type: daemonset\n record: namespace_workload_pod:kube_pod_owner:relabel\n\ \ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\ \ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"\ },\n \"workload\", \"$1\", \"owner_name\", \"(.*)\")\n )\n labels:\n\ \ workload_type: statefulset\n record: namespace_workload_pod:kube_pod_owner:relabel\n\ \ - expr: |-\n group by (cluster, namespace, workload, pod) (\n label_join(\n\ \ group by (cluster, namespace, job_name, pod, owner_name) (\n \ \ label_join(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"\ Job\"}\n , 
\"job_name\", \"\", \"owner_name\")\n )\n \ \ * on (cluster, namespace, job_name) group_left()\n group by (cluster,\ \ namespace, job_name) (\n kube_job_owner{job=\"kube-state-metrics\"\ , owner_kind=~\"Pod|\"}\n )\n , \"workload\", \"\", \"owner_name\"\ )\n )\n labels:\n workload_type: job\n record: namespace_workload_pod:kube_pod_owner:relabel\n\ \ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\ \ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"\", owner_name=\"\ \"},\n \"workload\", \"$1\", \"pod\", \"(.+)\")\n )\n labels:\n\ \ workload_type: barepod\n record: namespace_workload_pod:kube_pod_owner:relabel\n\ \ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\ \ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Node\"},\n\ \ \"workload\", \"$1\", \"pod\", \"(.+)\")\n )\n labels:\n \ \ workload_type: staticpod\n record: namespace_workload_pod:kube_pod_owner:relabel\n\ \ - expr: |-\n group by (cluster, namespace, workload, workload_type, pod)\ \ (\n label_join(\n label_join(\n group by (cluster,\ \ namespace, job_name, pod) (\n label_join(\n kube_pod_owner{job=\"\ kube-state-metrics\", owner_kind=\"Job\"}\n , \"job_name\", \"\"\ , \"owner_name\")\n )\n * on (cluster, namespace, job_name)\ \ group_left(owner_kind, owner_name)\n group by (cluster, namespace,\ \ job_name, owner_kind, owner_name) (\n kube_job_owner{job=\"kube-state-metrics\"\ , owner_kind!=\"Pod\", owner_kind!=\"\"}\n )\n , \"workload\"\ , \"\", \"owner_name\")\n , \"workload_type\", \"\", \"owner_kind\")\n\n\ \ OR\n\n label_replace(\n label_replace(\n label_replace(\n\ \ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"\ }\n , \"replicaset\", \"$1\", \"owner_name\", \"(.+)\"\n \ \ )\n * on (cluster, namespace, replicaset) group_left(owner_kind,\ \ owner_name)\n group by (cluster, namespace, replicaset, owner_kind,\ \ owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"\ , owner_kind!=\"Deployment\", 
owner_kind!=\"\"}\n )\n , \"\ workload\", \"$1\", \"owner_name\", \"(.+)\")\n OR\n label_replace(\n\ \ group by (cluster, namespace, pod, owner_name, owner_kind) (\n \ \ kube_pod_owner{job=\"kube-state-metrics\", owner_kind!=\"ReplicaSet\"\ , owner_kind!=\"DaemonSet\", owner_kind!=\"StatefulSet\", owner_kind!=\"Job\"\ , owner_kind!=\"Node\", owner_kind!=\"\"}\n )\n , \"workload\"\ , \"$1\", \"owner_name\", \"(.+)\"\n )\n , \"workload_type\",\ \ \"$1\", \"owner_kind\", \"(.+)\")\n )\n record: namespace_workload_pod:kube_pod_owner:relabel\n" monitoring-kube-prometheus-stack-kube-apiserver-availability.rules-fe1708b8-332c-4d9d-be30-1a6e49774f5a.yaml: "groups:\n\ - interval: 3m\n name: kube-apiserver-availability.rules\n rules:\n - expr:\ \ avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 *\n \ \ 30\n record: code_verb:apiserver_request_total:increase30d\n - expr:\ \ sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"\ LIST|GET\"})\n labels:\n verb: read\n record: code:apiserver_request_total:increase30d\n\ \ - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"\ POST|PUT|PATCH|DELETE\"})\n labels:\n verb: write\n record: code:apiserver_request_total:increase30d\n\ \ - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))\n\ \ record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h\n\ \ - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d])\n\ \ * 24 * 30)\n record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d\n\ \ - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le=\"\ +Inf\"})\n record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h\n\ \ - expr: sum by (cluster, verb, scope) 
(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le=\"\ +Inf\"})\n record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d\n\ \ - expr: |-\n 1 - (\n (\n # write too slow\n sum\ \ by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\ POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"} or vector(0))\n ) +\n \ \ (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\ LIST|GET\"})\n -\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"} or vector(0))\n \ \ +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"} or vector(0))\n \ \ +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"} or vector(0))\n )\n\ \ ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"\ 5..\"} or vector(0))\n )\n /\n sum by (cluster) (code:apiserver_request_total:increase30d)\n\ \ labels:\n verb: all\n record: apiserver_request:availability30d\n\ \ - expr: |-\n 1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\ LIST|GET\"})\n -\n (\n # too slow\n sum by (cluster)\ \ (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"} or vector(0))\n +\n\ \ sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ 
LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"} or vector(0))\n +\n\ \ sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"} or vector(0))\n )\n \ \ +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"\ read\",code=~\"5..\"} or vector(0))\n )\n /\n sum by (cluster)\ \ (code:apiserver_request_total:increase30d{verb=\"read\"})\n labels:\n \ \ verb: read\n record: apiserver_request:availability30d\n - expr: |-\n\ \ 1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\ POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\ POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"} or vector(0))\n )\n \ \ +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"\ write\",code=~\"5..\"} or vector(0))\n )\n /\n sum by (cluster)\ \ (code:apiserver_request_total:increase30d{verb=\"write\"})\n labels:\n \ \ verb: write\n record: apiserver_request:availability30d\n - expr: sum\ \ by (cluster,code,resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"\ LIST|GET\"}[5m]))\n labels:\n verb: read\n record: code_resource:apiserver_request_total:rate5m\n\ \ - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n labels:\n verb: write\n\ \ record: code_resource:apiserver_request_total:rate5m\n - expr: sum by (cluster,\ \ code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\"\ ,code=~\"2..\"}[1h]))\n record: code_verb:apiserver_request_total:increase1h\n\ \ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"\ apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"3..\"}[1h]))\n \ \ record: 
code_verb:apiserver_request_total:increase1h\n - expr: sum by (cluster,\ \ code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\"\ ,code=~\"4..\"}[1h]))\n record: code_verb:apiserver_request_total:increase1h\n\ \ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"\ apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n \ \ record: code_verb:apiserver_request_total:increase1h\n" monitoring-kube-prometheus-stack-kube-apiserver-burnrate.rules-5b37d5cc-ec92-44ef-8b83-84bd7039e174.yaml: "groups:\n\ - name: kube-apiserver-burnrate.rules\n rules:\n - expr: |-\n (\n \ \ (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[1d]))\n -\n (\n (\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"\ LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\"\ ,le=~\"1(\\\\.0)?\"}[1d]))\n or\n vector(0)\n \ \ )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1d]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[1d]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n )\n /\n sum by (cluster)\ \ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n\ \ labels:\n verb: read\n record: apiserver_request:burnrate1d\n -\ \ expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ 
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n \ \ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[1h]))\n or\n \ \ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1h]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[1h]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n )\n /\n sum by (cluster)\ \ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n\ \ labels:\n verb: read\n record: apiserver_request:burnrate1h\n -\ \ expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n \ \ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[2h]))\n or\n \ \ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[2h]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[2h]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) 
(rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n )\n /\n sum by (cluster)\ \ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n\ \ labels:\n verb: read\n record: apiserver_request:burnrate2h\n -\ \ expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n \ \ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[30m]))\n or\n \ \ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[30m]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[30m]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n )\n /\n \ \ sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"\ }[30m]))\n labels:\n verb: read\n record: apiserver_request:burnrate30m\n\ \ - expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n \ \ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[3d]))\n or\n \ \ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ 
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[3d]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[3d]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n )\n /\n sum by (cluster)\ \ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n\ \ labels:\n verb: read\n record: apiserver_request:burnrate3d\n -\ \ expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n \ \ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[5m]))\n or\n \ \ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[5m]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[5m]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n )\n /\n sum by (cluster)\ \ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n\ \ labels:\n verb: read\n record: apiserver_request:burnrate5m\n -\ \ expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ 
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n \ \ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[6h]))\n or\n \ \ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[6h]))\n +\n sum\ \ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\ ,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\ cluster\",le=~\"30(\\\\.0)?\"}[6h]))\n )\n )\n +\n \ \ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n )\n /\n sum by (cluster)\ \ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n\ \ labels:\n verb: read\n record: apiserver_request:burnrate6h\n -\ \ expr: |-\n (\n (\n # too slow\n sum by (cluster)\ \ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\ POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n\ \ -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n )\n \ \ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n labels:\n verb: write\n record:\ \ apiserver_request:burnrate1d\n - expr: |-\n (\n (\n #\ \ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ 
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[1h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n )\n \ \ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n labels:\n verb: write\n record:\ \ apiserver_request:burnrate1h\n - expr: |-\n (\n (\n #\ \ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[2h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n )\n \ \ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n labels:\n verb: write\n record:\ \ apiserver_request:burnrate2h\n - expr: |-\n (\n (\n #\ \ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[30m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n )\n \ \ /\n sum by (cluster) 
(rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n labels:\n verb: write\n \ \ record: apiserver_request:burnrate30m\n - expr: |-\n (\n (\n \ \ # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[3d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n )\n \ \ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n labels:\n verb: write\n record:\ \ apiserver_request:burnrate3d\n - expr: |-\n (\n (\n #\ \ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[5m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n )\n \ \ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n labels:\n verb: write\n record:\ \ apiserver_request:burnrate5m\n - expr: |-\n (\n (\n #\ \ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[6h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ 
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ ,le=~\"1(\\\\.0)?\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n )\n \ \ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\ ,verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n labels:\n verb: write\n record:\ \ apiserver_request:burnrate6h\n" monitoring-kube-prometheus-stack-kube-apiserver-histogram.rules-3df1ba86-8ec2-4750-a04f-5e59108c7ba3.yaml: "groups:\n\ - name: kube-apiserver-histogram.rules\n rules:\n - expr: histogram_quantile(0.99,\ \ sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[5m])))\n > 0\n labels:\n quantile: \"0.99\"\n verb: read\n\ \ record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\ apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\ }[5m])))\n > 0\n labels:\n quantile: \"0.99\"\n verb: write\n\ \ record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile\n" monitoring-kube-prometheus-stack-kube-apiserver-slos-be960e5a-cbba-488b-b2a9-b89b70183179.yaml: "groups:\n\ - name: kube-apiserver-slos\n rules:\n - alert: KubeAPIErrorBudgetBurn\n \ \ annotations:\n description: The API server is burning too much error budget\ \ on cluster {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn\n\ \ summary: The API server is burning too much error budget.\n expr: |-\n\ \ sum by (cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000)\n \ \ and on (cluster)\n sum by (cluster) (apiserver_request:burnrate5m)\ \ > (14.40 * 0.01000)\n for: 2m\n 
labels:\n long: 1h\n severity:\ \ critical\n short: 5m\n - alert: KubeAPIErrorBudgetBurn\n annotations:\n\ \ description: The API server is burning too much error budget on cluster\ \ {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn\n\ \ summary: The API server is burning too much error budget.\n expr: |-\n\ \ sum by (cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000)\n \ \ and on (cluster)\n sum by (cluster) (apiserver_request:burnrate30m)\ \ > (6.00 * 0.01000)\n for: 15m\n labels:\n long: 6h\n severity:\ \ critical\n short: 30m\n - alert: KubeAPIErrorBudgetBurn\n annotations:\n\ \ description: The API server is burning too much error budget on cluster\ \ {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn\n\ \ summary: The API server is burning too much error budget.\n expr: |-\n\ \ sum by (cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000)\n \ \ and on (cluster)\n sum by (cluster) (apiserver_request:burnrate2h) >\ \ (3.00 * 0.01000)\n for: 1h\n labels:\n long: 1d\n severity:\ \ warning\n short: 2h\n - alert: KubeAPIErrorBudgetBurn\n annotations:\n\ \ description: The API server is burning too much error budget on cluster\ \ {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn\n\ \ summary: The API server is burning too much error budget.\n expr: |-\n\ \ sum by (cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000)\n \ \ and on (cluster)\n sum by (cluster) (apiserver_request:burnrate6h) >\ \ (1.00 * 0.01000)\n for: 3h\n labels:\n long: 3d\n severity:\ \ warning\n short: 6h\n" monitoring-kube-prometheus-stack-kube-prometheus-general.rules-cdf9488c-4fa3-4c7a-9be4-ddcaee437598.yaml: "groups:\n\ - name: kube-prometheus-general.rules\n rules:\n - expr: count without(instance,\ \ pod, node) (up == 1)\n record: count:up1\n - expr: 
count without(instance,\ \ pod, node) (up == 0)\n record: count:up0\n" monitoring-kube-prometheus-stack-kube-prometheus-node-recording.rules-14de50cd-57b8-4248-a7c5-054469786b93.yaml: "groups:\n\ - name: kube-prometheus-node-recording.rules\n rules:\n - expr: sum(rate(node_cpu_seconds_total{mode!=\"\ idle\",mode!=\"iowait\",mode!=\"steal\"}[3m]))\n BY (instance)\n record:\ \ instance:node_cpu:rate:sum\n - expr: sum(rate(node_network_receive_bytes_total[3m]))\ \ BY (instance)\n record: instance:node_network_receive_bytes:rate:sum\n -\ \ expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)\n record:\ \ instance:node_network_transmit_bytes:rate:sum\n - expr: sum(rate(node_cpu_seconds_total{mode!=\"\ idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))\n WITHOUT (cpu, mode) / ON\ \ (instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)\n BY (instance,\ \ cpu)) BY (instance)\n record: instance:node_cpu:ratio\n - expr: sum(rate(node_cpu_seconds_total{mode!=\"\ idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))\n record: cluster:node_cpu:sum_rate5m\n\ \ - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY\ \ (instance,\n cpu))\n record: cluster:node_cpu:ratio\n" monitoring-kube-prometheus-stack-kube-scheduler.rules-da454218-3276-463d-abe3-7043553c8f35.yaml: "groups:\n\ - name: kube-scheduler.rules\n rules:\n - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.99\"\n record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.99\"\n record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.99, 
sum(rate(scheduler_pod_scheduling_sli_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.99\"\n record: cluster_quantile:scheduler_pod_scheduling_sli_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.9\"\n record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.9\"\n record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.9, sum(rate(scheduler_pod_scheduling_sli_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.9\"\n record: cluster_quantile:scheduler_pod_scheduling_sli_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.5\"\n record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.5\"\n record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile\n\ \ - expr: histogram_quantile(0.5, sum(rate(scheduler_pod_scheduling_sli_duration_seconds_bucket{job=\"\ kube-scheduler\"}[5m]))\n without(instance, pod))\n labels:\n quantile:\ \ \"0.5\"\n record: cluster_quantile:scheduler_pod_scheduling_sli_duration_seconds:histogram_quantile\n" 
monitoring-kube-prometheus-stack-kube-state-metrics-9826e852-e343-4d08-9f0c-4c5896358ba2.yaml: "groups:\n\ - name: kube-state-metrics\n rules:\n - alert: KubeStateMetricsListErrors\n\ \ annotations:\n description: kube-state-metrics is experiencing errors\ \ at an elevated rate in\n list operations. This is likely causing it to\ \ not be able to expose metrics\n about Kubernetes objects correctly or\ \ at all.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors\n\ \ summary: kube-state-metrics is experiencing errors in list operations.\n\ \ expr: |-\n (sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"\ ,result=\"error\"}[5m])) by (cluster)\n /\n sum(rate(kube_state_metrics_list_total{job=\"\ kube-state-metrics\"}[5m])) by (cluster))\n > 0.01\n for: 15m\n labels:\n\ \ severity: critical\n - alert: KubeStateMetricsWatchErrors\n annotations:\n\ \ description: kube-state-metrics is experiencing errors at an elevated rate\ \ in\n watch operations. 
This is likely causing it to not be able to expose\ \ metrics\n about Kubernetes objects correctly or at all.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors\n\ \ summary: kube-state-metrics is experiencing errors in watch operations.\n\ \ expr: |-\n (sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"\ ,result=\"error\"}[5m])) by (cluster)\n /\n sum(rate(kube_state_metrics_watch_total{job=\"\ kube-state-metrics\"}[5m])) by (cluster))\n > 0.01\n for: 15m\n labels:\n\ \ severity: critical\n - alert: KubeStateMetricsShardingMismatch\n annotations:\n\ \ description: kube-state-metrics pods are running with different --total-shards\n\ \ configuration, some Kubernetes objects may be exposed multiple times\ \ or not\n exposed at all.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch\n\ \ summary: kube-state-metrics sharding is misconfigured.\n expr: stdvar\ \ (kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) by (cluster)\n\ \ != 0\n for: 15m\n labels:\n severity: critical\n - alert: KubeStateMetricsShardsMissing\n\ \ annotations:\n description: kube-state-metrics shards are missing, some\ \ Kubernetes objects\n are not being exposed.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing\n\ \ summary: kube-state-metrics shards are missing.\n expr: |-\n 2^max(kube_state_metrics_total_shards{job=\"\ kube-state-metrics\"}) by (cluster) - 1\n -\n sum( 2 ^ max by (cluster,\ \ shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"\ }) ) by (cluster)\n != 0\n for: 15m\n labels:\n severity: critical\n" monitoring-kube-prometheus-stack-kubelet.rules-931b5e0c-2d70-4f8b-9987-5b5cfaac8845.yaml: "groups:\n\ - name: kubelet.rules\n rules:\n - expr: |-\n histogram_quantile(\n \ \ 0.99,\n sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job=\"\ 
kubelet\", metrics_path=\"/metrics\"}[5m])) by (cluster, instance, le)\n \ \ * on (cluster, instance) group_left (node)\n max by (cluster, instance,\ \ node) (kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n \ \ )\n labels:\n quantile: \"0.99\"\n record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile\n\ \ - expr: |-\n histogram_quantile(\n 0.9,\n sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job=\"\ kubelet\", metrics_path=\"/metrics\"}[5m])) by (cluster, instance, le)\n \ \ * on (cluster, instance) group_left (node)\n max by (cluster, instance,\ \ node) (kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n \ \ )\n labels:\n quantile: \"0.9\"\n record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile\n\ \ - expr: |-\n histogram_quantile(\n 0.5,\n sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job=\"\ kubelet\", metrics_path=\"/metrics\"}[5m])) by (cluster, instance, le)\n \ \ * on (cluster, instance) group_left (node)\n max by (cluster, instance,\ \ node) (kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"})\n \ \ )\n labels:\n quantile: \"0.5\"\n record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile\n" monitoring-kube-prometheus-stack-kubernetes-apps-eaf9c0ce-babd-40f5-913f-7c8c14272dcc.yaml: "groups:\n\ - name: kubernetes-apps\n rules:\n - alert: KubePodCrashLooping\n annotations:\n\ \ description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container\n\ \ }}) is in waiting state (reason: \"CrashLoopBackOff\") on cluster {{\ \ $labels.cluster\n }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping\n\ \ summary: Pod is crash looping.\n expr: max_over_time(kube_pod_container_status_waiting_reason{reason=\"\ CrashLoopBackOff\",\n job=\"kube-state-metrics\", namespace=~\".*\"}[5m])\ \ >= 1\n for: 15m\n labels:\n severity: warning\n - alert: KubePodNotReady\n\ \ annotations:\n description: 
Pod {{ $labels.namespace }}/{{ $labels.pod\ \ }} has been in a non-ready\n state for longer than 15 minutes on cluster\ \ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready\n\ \ summary: Pod has been in a non-ready state for more than 15 minutes.\n\ \ expr: |-\n sum by (namespace, pod, job, cluster) (\n max by (namespace,\ \ pod, job, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\"\ , namespace=~\".*\", phase=~\"Pending|Unknown\"}\n ) * on (namespace, pod,\ \ cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (\n \ \ 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"\ Job\"})\n )\n ) > 0\n for: 15m\n labels:\n severity: warning\n\ \ - alert: KubeDeploymentGenerationMismatch\n annotations:\n description:\ \ Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment\n \ \ }} does not match, this indicates that the Deployment has failed but has\ \ not\n been rolled back on cluster {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch\n\ \ summary: Deployment generation mismatch due to possible roll-back\n \ \ expr: |-\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n !=\n kube_deployment_metadata_generation{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n for: 15m\n labels:\n severity:\ \ warning\n - alert: KubeDeploymentReplicasMismatch\n annotations:\n \ \ description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has\n\ \ not matched the expected number of replicas for longer than 15 minutes\ \ on\n cluster {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch\n\ \ summary: Deployment has not matched the expected number of replicas.\n\ \ expr: |-\n (\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"\ , 
namespace=~\".*\"}\n >\n kube_deployment_status_replicas_available{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n ) and (\n changes(kube_deployment_status_replicas_updated{job=\"\ kube-state-metrics\", namespace=~\".*\"}[10m])\n ==\n 0\n \ \ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeDeploymentRolloutStuck\n\ \ annotations:\n description: Rollout of deployment {{ $labels.namespace\ \ }}/{{ $labels.deployment\n }} is not progressing for longer than 15 minutes\ \ on cluster {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck\n\ \ summary: Deployment rollout is not progressing.\n expr: |-\n kube_deployment_status_condition{condition=\"\ Progressing\", status=\"false\",job=\"kube-state-metrics\", namespace=~\".*\"\ }\n != 0\n for: 15m\n labels:\n severity: warning\n - alert:\ \ KubeStatefulSetReplicasMismatch\n annotations:\n description: StatefulSet\ \ {{ $labels.namespace }}/{{ $labels.statefulset }} has\n not matched the\ \ expected number of replicas for longer than 15 minutes on\n cluster {{\ \ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch\n\ \ summary: StatefulSet has not matched the expected number of replicas.\n\ \ expr: |-\n (\n kube_statefulset_status_replicas_ready{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n !=\n kube_statefulset_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n ) and (\n changes(kube_statefulset_status_replicas_updated{job=\"\ kube-state-metrics\", namespace=~\".*\"}[10m])\n ==\n 0\n \ \ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeStatefulSetGenerationMismatch\n\ \ annotations:\n description: StatefulSet generation for {{ $labels.namespace\ \ }}/{{ $labels.statefulset\n }} does not match, this indicates that the\ \ StatefulSet has failed but has\n not been rolled back on cluster {{ $labels.cluster\ \ }}.\n runbook_url: 
https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch\n\ \ summary: StatefulSet generation mismatch due to possible roll-back\n \ \ expr: |-\n kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n !=\n kube_statefulset_metadata_generation{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n for: 15m\n labels:\n severity:\ \ warning\n - alert: KubeStatefulSetUpdateNotRolledOut\n annotations:\n \ \ description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}\ \ update\n has not been rolled out on cluster {{ $labels.cluster }}.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout\n\ \ summary: StatefulSet update has not been rolled out.\n expr: |-\n \ \ (\n max by (namespace, statefulset, job, cluster) (\n kube_statefulset_status_current_revision{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n unless\n kube_statefulset_status_update_revision{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n )\n * on (namespace,\ \ statefulset, job, cluster)\n (\n kube_statefulset_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n )\n ) and on (namespace,\ \ statefulset, job, cluster) (\n changes(kube_statefulset_status_replicas_updated{job=\"\ kube-state-metrics\", namespace=~\".*\"}[5m])\n ==\n 0\n \ \ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeDaemonSetRolloutStuck\n\ \ annotations:\n description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset\ \ }} has not\n finished or progressed for at least 15m on cluster {{ $labels.cluster\ \ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck\n\ \ summary: DaemonSet rollout is stuck.\n expr: |-\n (\n (\n\ \ kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n !=\n 
kube_daemonset_status_desired_number_scheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n !=\n 0\n \ \ ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n ) or (\n kube_daemonset_status_number_available{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n )\n ) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}[5m])\n ==\n 0\n \ \ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeContainerWaiting\n\ \ annotations:\n description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace\ \ }} on\n container {{ $labels.container}} has been in waiting state for\ \ longer than\n 1 hour. 
(reason: \"{{ $labels.reason }}\") on cluster {{\ \ $labels.cluster }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting\n\ \ summary: Pod container waiting longer than 1 hour\n expr: kube_pod_container_status_waiting_reason{reason!=\"\ CrashLoopBackOff\", job=\"kube-state-metrics\",\n namespace=~\".*\"} > 0\n\ \ for: 1h\n labels:\n severity: warning\n - alert: KubeDaemonSetNotScheduled\n\ \ annotations:\n description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace\ \ }}/{{ $labels.daemonset\n }} are not scheduled on cluster {{ $labels.cluster\ \ }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled\n\ \ summary: DaemonSet pods are not scheduled.\n expr: |-\n kube_daemonset_status_desired_number_scheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n -\n kube_daemonset_status_current_number_scheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"} > 0\n for: 10m\n labels:\n \ \ severity: warning\n - alert: KubeDaemonSetMisScheduled\n annotations:\n\ \ description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{\ \ $labels.daemonset\n }} are running where they are not supposed to run\ \ on cluster {{ $labels.cluster\n }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled\n\ \ summary: DaemonSet pods are misscheduled.\n expr: kube_daemonset_status_number_misscheduled{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n > 0\n for: 15m\n labels:\n\ \ severity: warning\n - alert: KubeJobNotCompleted\n annotations:\n \ \ description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking\ \ more\n than {{ \"43200\" | humanizeDuration }} to complete on cluster\ \ {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted\n\ \ summary: Job did not complete in time\n expr: |-\n time() - max\ \ by (namespace, job_name, cluster) 
(kube_job_status_start_time{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n and\n kube_job_status_active{job=\"kube-state-metrics\"\ , namespace=~\".*\"} > 0) > 43200\n labels:\n severity: warning\n - alert:\ \ KubeJobFailed\n annotations:\n description: Job {{ $labels.namespace\ \ }}/{{ $labels.job_name }} failed to complete.\n Removing failed job after\ \ investigation should clear this alert on cluster\n {{ $labels.cluster\ \ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed\n\ \ summary: Job failed to complete.\n expr: kube_job_failed{job=\"kube-state-metrics\"\ , namespace=~\".*\"} > 0\n for: 15m\n labels:\n severity: warning\n\ \ - alert: KubeHpaReplicasMismatch\n annotations:\n description: HPA\ \ {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}\n has\ \ not matched the desired number of replicas for longer than 15 minutes\n \ \ on cluster {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch\n\ \ summary: HPA has not matched desired number of replicas.\n expr: |-\n\ \ (kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n !=\n kube_horizontalpodautoscaler_status_current_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"})\n and\n (kube_horizontalpodautoscaler_status_current_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n >\n kube_horizontalpodautoscaler_spec_min_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"})\n and\n (kube_horizontalpodautoscaler_status_current_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n <\n kube_horizontalpodautoscaler_spec_max_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"})\n and\n changes(kube_horizontalpodautoscaler_status_current_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}[15m]) == 0\n for: 15m\n labels:\n\ \ severity: warning\n - alert: KubeHpaMaxedOut\n annotations:\n \ \ description: HPA 
{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler\ \ }}\n has been running at max replicas for longer than 15 minutes on\ \ cluster {{\n $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout\n\ \ summary: HPA is running at max replicas\n expr: |-\n (\n \ \ kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n ==\n kube_horizontalpodautoscaler_spec_max_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n )\n and on (namespace, horizontalpodautoscaler)\ \ (\n kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n !=\n kube_horizontalpodautoscaler_spec_min_replicas{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n )\n for: 15m\n labels:\n\ \ severity: warning\n - alert: KubePdbNotEnoughHealthyPods\n annotations:\n\ \ description: PDB {{ $labels.cluster }}/{{ $labels.namespace }}/{{ $labels.poddisruptionbudget\n\ \ }} expects {{ $value }} more healthy pods. 
The desired number of healthy\ \ pods\n has not been met for at least 15m.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods\n\ \ summary: PDB does not have enough healthy pods.\n expr: |-\n (\n\ \ kube_poddisruptionbudget_status_desired_healthy{job=\"kube-state-metrics\"\ , namespace=~\".*\"}\n -\n kube_poddisruptionbudget_status_current_healthy{job=\"\ kube-state-metrics\", namespace=~\".*\"}\n )\n > 0\n for: 15m\n \ \ labels:\n severity: warning\n" monitoring-kube-prometheus-stack-kubernetes-resources-b694afc5-821c-4800-a61c-a61d36f5c15f.yaml: "groups:\n\ - name: kubernetes-resources\n rules:\n - alert: KubeCPUOvercommit\n annotations:\n\ \ description: Cluster {{ $labels.cluster }} has overcommitted CPU resource\ \ requests\n for Pods by {{ printf \"%.2f\" $value }} CPU shares and cannot\ \ tolerate node\n failure.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit\n\ \ summary: Cluster has overcommitted CPU resource requests.\n expr: |-\n\ \ # Non-HA clusters.\n (\n (\n sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})\n\ \ -\n sum by (cluster) (kube_node_status_allocatable{job=\"\ kube-state-metrics\",resource=\"cpu\"}) > 0\n )\n and\n count\ \ by (cluster) (max by (cluster, node) (kube_node_role{job=\"kube-state-metrics\"\ , role=\"control-plane\"})) < 3\n )\n or\n # HA clusters.\n \ \ (\n sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})\n\ \ -\n (\n # Skip clusters with only one allocatable node.\n\ \ (\n sum by (cluster) (kube_node_status_allocatable{job=\"\ kube-state-metrics\",resource=\"cpu\"})\n -\n max by (cluster)\ \ (kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"})\n\ \ ) > 0\n ) > 0\n )\n for: 10m\n labels:\n severity:\ \ warning\n - alert: KubeMemoryOvercommit\n annotations:\n description:\ \ Cluster {{ $labels.cluster }} has overcommitted memory resource\n requests\ \ for 
Pods by {{ $value | humanize }} bytes and cannot tolerate node\n \ \ failure.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit\n\ \ summary: Cluster has overcommitted memory resource requests.\n expr:\ \ |-\n # Non-HA clusters.\n (\n (\n sum by (cluster)\ \ (namespace_memory:kube_pod_container_resource_requests:sum{})\n -\n\ \ sum by (cluster) (kube_node_status_allocatable{job=\"kube-state-metrics\"\ ,resource=\"memory\"}) > 0\n )\n and\n count by (cluster)\ \ (max by (cluster, node) (kube_node_role{job=\"kube-state-metrics\", role=\"\ control-plane\"})) < 3\n )\n or\n # HA clusters.\n (\n \ \ sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})\n\ \ -\n (\n # Skip clusters with only one allocatable node.\n\ \ (\n sum by (cluster) (kube_node_status_allocatable{job=\"\ kube-state-metrics\",resource=\"memory\"})\n -\n max by\ \ (cluster) (kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"\ memory\"})\n ) > 0\n ) > 0\n )\n for: 10m\n labels:\n\ \ severity: warning\n - alert: KubeCPUQuotaOvercommit\n annotations:\n\ \ description: Cluster {{ $labels.cluster }} has overcommitted CPU resource\ \ requests\n for Namespaces.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit\n\ \ summary: Cluster has overcommitted CPU resource requests.\n expr: |-\n\ \ sum by (cluster) (\n min without(resource) (kube_resourcequota{job=\"\ kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})\n \ \ )\n /\n sum by (cluster) (\n kube_node_status_allocatable{resource=\"\ cpu\", job=\"kube-state-metrics\"}\n ) > 1.5\n for: 5m\n labels:\n\ \ severity: warning\n - alert: KubeMemoryQuotaOvercommit\n annotations:\n\ \ description: Cluster {{ $labels.cluster }} has overcommitted memory resource\n\ \ requests for Namespaces.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit\n\ \ summary: Cluster 
has overcommitted memory resource requests.\n expr:\ \ |-\n sum by (cluster) (\n min without(resource) (kube_resourcequota{job=\"\ kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})\n\ \ )\n /\n sum by (cluster) (\n kube_node_status_allocatable{resource=\"\ memory\", job=\"kube-state-metrics\"}\n ) > 1.5\n for: 5m\n labels:\n\ \ severity: warning\n - alert: KubeQuotaAlmostFull\n annotations:\n \ \ description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage\n\ \ }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster\ \ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull\n\ \ summary: Namespace quota is going to be full.\n expr: |-\n max\ \ without (instance, job, type) (\n kube_resourcequota{job=\"kube-state-metrics\"\ , type=\"used\"}\n )\n / on (cluster, namespace, resource, resourcequota)\ \ group_left()\n (\n max without (instance, job, type) (\n \ \ kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"}\n ) >\ \ 0\n )\n > 0.9 < 1\n for: 15m\n labels:\n severity: info\n\ \ - alert: KubeQuotaFullyUsed\n annotations:\n description: Namespace\ \ {{ $labels.namespace }} is using {{ $value | humanizePercentage\n }}\ \ of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.\n \ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused\n\ \ summary: Namespace quota is fully used.\n expr: |-\n max without\ \ (instance, job, type) (\n kube_resourcequota{job=\"kube-state-metrics\"\ , type=\"used\"}\n )\n / on (cluster, namespace, resource, resourcequota)\ \ group_left()\n (\n max without (instance, job, type) (\n \ \ kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"}\n ) >\ \ 0\n )\n == 1\n for: 15m\n labels:\n severity: info\n -\ \ alert: KubeQuotaExceeded\n annotations:\n description: Namespace {{\ \ $labels.namespace }} is using {{ $value | humanizePercentage\n }} of\ \ its {{ 
$labels.resource }} quota on cluster {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded\n\ \ summary: Namespace quota has exceeded the limits.\n expr: |-\n \ \ max without (instance, job, type) (\n kube_resourcequota{job=\"kube-state-metrics\"\ , type=\"used\"}\n )\n / on (cluster, namespace, resource, resourcequota)\ \ group_left()\n (\n max without (instance, job, type) (\n \ \ kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"}\n ) >\ \ 0\n ) > 1\n for: 15m\n labels:\n severity: warning\n - alert:\ \ CPUThrottlingHigh\n annotations:\n description: '{{ $value | humanizePercentage\ \ }} throttling of CPU in namespace\n {{ $labels.namespace }} for container\ \ {{ $labels.container }} in pod {{ $labels.pod\n }} on cluster {{ $labels.cluster\ \ }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh\n\ \ summary: Processes experience elevated CPU throttling.\n expr: |-\n\ \ sum without (id, metrics_path, name, image, endpoint, job, node) (\n \ \ topk by (cluster, namespace, pod, container, instance) (1,\n increase(\n\ \ container_cpu_cfs_throttled_periods_total{container!=\"\", job=\"\ kubelet\", metrics_path=\"/metrics/cadvisor\", }\n [5m])\n )\n\ \ )\n / on (cluster, namespace, pod, container, instance) group_left\n\ \ sum without (id, metrics_path, name, image, endpoint, job, node) (\n \ \ topk by (cluster, namespace, pod, container, instance) (1,\n increase(\n\ \ container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"\ /metrics/cadvisor\", }\n [5m])\n )\n )\n > ( 25 / 100\ \ )\n for: 15m\n labels:\n severity: info\n" monitoring-kube-prometheus-stack-kubernetes-storage-a21970f1-cefe-4cfc-876a-1833115df2e4.yaml: "groups:\n\ - name: kubernetes-storage\n rules:\n - alert: KubePersistentVolumeFillingUp\n\ \ annotations:\n description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim\n\ \ }} in Namespace {{ $labels.namespace 
}} {{ with $labels.cluster -}} on\ \ Cluster\n {{ . }} {{- end }} is only {{ $value | humanizePercentage }}\ \ free.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup\n\ \ summary: PersistentVolume is filling up.\n expr: |-\n (\n \ \ kubelet_volume_stats_available_bytes{job=\"kubelet\", namespace=~\".*\", metrics_path=\"\ /metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\"\ , namespace=~\".*\", metrics_path=\"/metrics\"}\n ) < 0.03\n and\n \ \ kubelet_volume_stats_used_bytes{job=\"kubelet\", namespace=~\".*\", metrics_path=\"\ /metrics\"} > 0\n unless on (cluster, namespace, persistentvolumeclaim)\n\ \ kube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} ==\ \ 1\n unless on (cluster, namespace, persistentvolumeclaim)\n kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"\ true\"} == 1\n for: 1m\n labels:\n severity: critical\n - alert: KubePersistentVolumeFillingUp\n\ \ annotations:\n description: Based on recent sampling, the PersistentVolume\ \ claimed by {{ $labels.persistentvolumeclaim\n }} in Namespace {{ $labels.namespace\ \ }} {{ with $labels.cluster -}} on Cluster\n {{ . }} {{- end }} is expected\ \ to fill up within four days. 
Currently {{ $value\n | humanizePercentage\ \ }} is available.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup\n\ \ summary: PersistentVolume is filling up.\n expr: |-\n (\n \ \ kubelet_volume_stats_available_bytes{job=\"kubelet\", namespace=~\".*\", metrics_path=\"\ /metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\"\ , namespace=~\".*\", metrics_path=\"/metrics\"}\n ) < 0.15\n and\n \ \ kubelet_volume_stats_used_bytes{job=\"kubelet\", namespace=~\".*\", metrics_path=\"\ /metrics\"} > 0\n and\n predict_linear(kubelet_volume_stats_available_bytes{job=\"\ kubelet\", namespace=~\".*\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) <\ \ 0\n unless on (cluster, namespace, persistentvolumeclaim)\n kube_persistentvolumeclaim_access_mode{\ \ access_mode=\"ReadOnlyMany\"} == 1\n unless on (cluster, namespace, persistentvolumeclaim)\n\ \ kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}\ \ == 1\n for: 1h\n labels:\n severity: warning\n - alert: KubePersistentVolumeInodesFillingUp\n\ \ annotations:\n description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim\n\ \ }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on\ \ Cluster\n {{ . 
}} {{- end }} only has {{ $value | humanizePercentage\ \ }} free inodes.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup\n\ \ summary: PersistentVolumeInodes are filling up.\n expr: |-\n (\n\ \ kubelet_volume_stats_inodes_free{job=\"kubelet\", namespace=~\".*\",\ \ metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"\ kubelet\", namespace=~\".*\", metrics_path=\"/metrics\"}\n ) < 0.03\n \ \ and\n kubelet_volume_stats_inodes_used{job=\"kubelet\", namespace=~\"\ .*\", metrics_path=\"/metrics\"} > 0\n unless on (cluster, namespace, persistentvolumeclaim)\n\ \ kube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} ==\ \ 1\n unless on (cluster, namespace, persistentvolumeclaim)\n kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"\ true\"} == 1\n for: 1m\n labels:\n severity: critical\n - alert: KubePersistentVolumeInodesFillingUp\n\ \ annotations:\n description: Based on recent sampling, the PersistentVolume\ \ claimed by {{ $labels.persistentvolumeclaim\n }} in Namespace {{ $labels.namespace\ \ }} {{ with $labels.cluster -}} on Cluster\n {{ . }} {{- end }} is expected\ \ to run out of inodes within four days. 
Currently\n {{ $value | humanizePercentage\ \ }} of its inodes are free.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup\n\ \ summary: PersistentVolumeInodes are filling up.\n expr: |-\n (\n\ \ kubelet_volume_stats_inodes_free{job=\"kubelet\", namespace=~\".*\",\ \ metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"\ kubelet\", namespace=~\".*\", metrics_path=\"/metrics\"}\n ) < 0.15\n \ \ and\n kubelet_volume_stats_inodes_used{job=\"kubelet\", namespace=~\"\ .*\", metrics_path=\"/metrics\"} > 0\n and\n predict_linear(kubelet_volume_stats_inodes_free{job=\"\ kubelet\", namespace=~\".*\", metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) <\ \ 0\n unless on (cluster, namespace, persistentvolumeclaim)\n kube_persistentvolumeclaim_access_mode{\ \ access_mode=\"ReadOnlyMany\"} == 1\n unless on (cluster, namespace, persistentvolumeclaim)\n\ \ kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}\ \ == 1\n for: 1h\n labels:\n severity: warning\n - alert: KubePersistentVolumeErrors\n\ \ annotations:\n description: The persistent volume {{ $labels.persistentvolume\ \ }} {{ with $labels.cluster\n -}} on Cluster {{ . 
}} {{- end }} has status\ \ {{ $labels.phase }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors\n\ \ summary: PersistentVolume is having issues with provisioning.\n expr:\ \ kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"\ }\n > 0\n for: 5m\n labels:\n severity: critical\n" monitoring-kube-prometheus-stack-kubernetes-system-26e1e614-9a40-44cd-8622-cff2d1258a88.yaml: "groups:\n\ - name: kubernetes-system\n rules:\n - alert: KubeVersionMismatch\n annotations:\n\ \ description: There are {{ $value }} different semantic versions of Kubernetes\n\ \ components running on cluster {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch\n\ \ summary: Different semantic versions of Kubernetes components running.\n\ \ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"\ kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"\ )))\n > 1\n for: 15m\n labels:\n severity: warning\n - alert:\ \ KubeClientErrors\n annotations:\n description: Kubernetes API server\ \ client '{{ $labels.job }}/{{ $labels.instance\n }}' is experiencing {{\ \ $value | humanizePercentage }} errors on cluster {{\n $labels.cluster\ \ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors\n\ \ summary: Kubernetes API server client is experiencing errors.\n expr:\ \ |-\n (sum(rate(rest_client_requests_total{job=\"apiserver\",code=~\"5..\"\ }[5m])) by (cluster, instance, job, namespace)\n /\n sum(rate(rest_client_requests_total{job=\"\ apiserver\"}[5m])) by (cluster, instance, job, namespace))\n > 0.01\n \ \ for: 15m\n labels:\n severity: warning\n" monitoring-kube-prometheus-stack-kubernetes-system-apiserver-f9528ec5-b467-4c90-829a-f4603a9bc7d5.yaml: "groups:\n\ - name: kubernetes-system-apiserver\n rules:\n - alert: 
KubeClientCertificateExpiration\n\ \ annotations:\n description: A client certificate used to authenticate\ \ to kubernetes apiserver\n is expiring in less than 7.0 days on cluster\ \ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration\n\ \ summary: Client certificate is about to expire.\n expr: |-\n histogram_quantile(0.01,\ \ sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"\ apiserver\"}[5m]))) < 604800\n and\n on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"\ apiserver\"} > 0\n for: 5m\n labels:\n severity: warning\n - alert:\ \ KubeClientCertificateExpiration\n annotations:\n description: A client\ \ certificate used to authenticate to kubernetes apiserver\n is expiring\ \ in less than 24.0 hours on cluster {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration\n\ \ summary: Client certificate is about to expire.\n expr: |-\n histogram_quantile(0.01,\ \ sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"\ apiserver\"}[5m]))) < 86400\n and\n on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"\ apiserver\"} > 0\n for: 5m\n labels:\n severity: critical\n - alert:\ \ KubeAggregatedAPIErrors\n annotations:\n description: Kubernetes aggregated\ \ API {{ $labels.instance }}/{{ $labels.name\n }} has reported {{ $labels.reason\ \ }} errors on cluster {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors\n\ \ summary: Kubernetes aggregated API has reported errors.\n expr: sum\ \ by (cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job=\"\ apiserver\"}[1m]))\n > 0\n for: 10m\n labels:\n severity: warning\n\ \ - alert: 
KubeAggregatedAPIDown\n annotations:\n description: Kubernetes\ \ aggregated API {{ $labels.name }}/{{ $labels.namespace\n }} has been\ \ only {{ $value | humanize }}% available over the last 10m on cluster\n \ \ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown\n\ \ summary: Kubernetes aggregated API is down.\n expr: (1 - max by (name,\ \ namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job=\"apiserver\"\ }[10m])))\n * 100 < 85\n for: 5m\n labels:\n severity: warning\n\ \ - alert: KubeAPIDown\n annotations:\n description: KubeAPI has disappeared\ \ from Prometheus target discovery.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown\n\ \ summary: Target disappeared from Prometheus target discovery.\n expr:\ \ absent(up{job=\"apiserver\"})\n for: 15m\n labels:\n severity: critical\n\ \ - alert: KubeAPITerminatedRequests\n annotations:\n description: The\ \ kubernetes apiserver has terminated {{ $value | humanizePercentage\n \ \ }} of its incoming requests on cluster {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests\n\ \ summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage\n\ \ }} of its incoming requests.\n expr: sum by (cluster) (rate(apiserver_request_terminations_total{job=\"\ apiserver\"}[10m]))\n / ( sum by (cluster) (rate(apiserver_request_total{job=\"\ apiserver\"}[10m])) +\n sum by (cluster) (rate(apiserver_request_terminations_total{job=\"\ apiserver\"}[10m]))\n ) > 0.20\n for: 5m\n labels:\n severity:\ \ warning\n" monitoring-kube-prometheus-stack-kubernetes-system-controller-manager-965b603b-05c3-4e36-9b70-30adcbb5400d.yaml: "groups:\n\ - name: kubernetes-system-controller-manager\n rules:\n - alert: KubeControllerManagerDown\n\ \ annotations:\n description: KubeControllerManager has disappeared from\ \ Prometheus target 
discovery.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown\n\ \ summary: Target disappeared from Prometheus target discovery.\n expr:\ \ absent(up{job=\"kube-controller-manager\"})\n for: 15m\n labels:\n \ \ severity: critical\n" monitoring-kube-prometheus-stack-kubernetes-system-kube-proxy-b946dd15-5f3f-490e-a788-25134043fbfb.yaml: "groups:\n\ - name: kubernetes-system-kube-proxy\n rules:\n - alert: KubeProxyDown\n \ \ annotations:\n description: KubeProxy has disappeared from Prometheus target\ \ discovery.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown\n\ \ summary: Target disappeared from Prometheus target discovery.\n expr:\ \ absent(up{job=\"kube-proxy\"})\n for: 15m\n labels:\n severity: critical\n" monitoring-kube-prometheus-stack-kubernetes-system-kubelet-3f5c198a-b883-4aa8-8f72-3001b24a1138.yaml: "groups:\n\ - name: kubernetes-system-kubelet\n rules:\n - alert: KubeNodeNotReady\n \ \ annotations:\n description: '{{ $labels.node }} has been unready for more\ \ than 15 minutes on\n cluster {{ $labels.cluster }}.'\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready\n\ \ summary: Node is not ready.\n expr: |-\n kube_node_status_condition{job=\"\ kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n and on (cluster,\ \ node)\n kube_node_spec_unschedulable{job=\"kube-state-metrics\"} == 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: KubeNodePressure\n\ \ annotations:\n description: '{{ $labels.node }} on cluster {{ $labels.cluster\ \ }} has active\n Condition {{ $labels.condition }}. 
This is caused by\ \ resource usage exceeding\n eviction thresholds.'\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure\n\ \ summary: Node has an active Condition.\n expr: |-\n kube_node_status_condition{job=\"\ kube-state-metrics\",condition=~\"(MemoryPressure|DiskPressure|PIDPressure)\"\ ,status=\"true\"} == 1\n and on (cluster, node)\n kube_node_spec_unschedulable{job=\"\ kube-state-metrics\"} == 0\n for: 10m\n labels:\n severity: info\n\ \ - alert: KubeNodeUnreachable\n annotations:\n description: '{{ $labels.node\ \ }} is unreachable and some workloads may be rescheduled\n on cluster\ \ {{ $labels.cluster }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable\n\ \ summary: Node is unreachable.\n expr: (kube_node_spec_taint{job=\"kube-state-metrics\"\ ,key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"}\n unless ignoring(key,value)\ \ kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"\ })\n == 1\n for: 15m\n labels:\n severity: warning\n - alert:\ \ KubeletTooManyPods\n annotations:\n description: Kubelet '{{ $labels.node\ \ }}' is running at {{ $value | humanizePercentage\n }} of its Pod capacity\ \ on cluster {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods\n\ \ summary: Kubelet is running at capacity.\n expr: |-\n (\n \ \ max by (cluster, instance) (\n kubelet_running_pods{job=\"kubelet\"\ , metrics_path=\"/metrics\"} > 1\n )\n * on (cluster, instance)\ \ group_left(node)\n max by (cluster, instance, node) (\n kubelet_node_name{job=\"\ kubelet\", metrics_path=\"/metrics\"}\n )\n )\n / on (cluster,\ \ node) group_left()\n max by (cluster, node) (\n kube_node_status_capacity{job=\"\ kube-state-metrics\", resource=\"pods\"} != 1\n ) > 0.95\n for: 15m\n\ \ labels:\n severity: 
info\n - alert: KubeNodeReadinessFlapping\n \ \ annotations:\n description: The readiness status of node {{ $labels.node\ \ }} has changed {{\n $value }} times in the last 15 minutes on cluster\ \ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping\n\ \ summary: Node readiness status is flapping.\n expr: |-\n sum(changes(kube_node_status_condition{job=\"\ kube-state-metrics\",status=\"true\",condition=\"Ready\"}[15m])) by (cluster,\ \ node) > 2\n and on (cluster, node)\n kube_node_spec_unschedulable{job=\"\ kube-state-metrics\"} == 0\n for: 15m\n labels:\n severity: warning\n\ \ - alert: KubeNodeEviction\n annotations:\n description: Node {{ $labels.node\ \ }} on {{ $labels.cluster }} is evicting Pods\n due to {{ $labels.eviction_signal\ \ }}. Eviction occurs when eviction thresholds\n are crossed, typically\ \ caused by Pods exceeding RAM/ephemeral-storage limits.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeeviction\n\ \ summary: Node is evicting pods.\n expr: |-\n sum(rate(kubelet_evictions{job=\"\ kubelet\", metrics_path=\"/metrics\"}[15m])) by (cluster, eviction_signal, instance)\n\ \ * on (cluster, instance) group_left(node)\n max by (cluster, instance,\ \ node) (\n kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"\ }\n )\n > 0\n for: 0s\n labels:\n severity: info\n - alert:\ \ KubeletPlegDurationHigh\n annotations:\n description: The Kubelet Pod\ \ Lifecycle Event Generator has a 99th percentile\n duration of {{ $value\ \ }} seconds on node {{ $labels.node }} on cluster {{\n $labels.cluster\ \ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh\n\ \ summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.\n\ \ expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"\ 0.99\"}\n >= 10\n for: 5m\n labels:\n severity: warning\n - alert:\ \ 
KubeletPodStartUpLatencyHigh\n annotations:\n description: Kubelet Pod\ \ startup 99th percentile latency is {{ $value }} seconds\n on node {{\ \ $labels.node }} on cluster {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh\n\ \ summary: Kubelet Pod startup latency is too high.\n expr: |-\n \ \ histogram_quantile(0.99,\n sum by (cluster, instance, le) (\n \ \ topk by (cluster, instance, le, operation_type) (1,\n rate(kubelet_pod_worker_duration_seconds_bucket{job=\"\ kubelet\", metrics_path=\"/metrics\"}[5m])\n )\n )\n )\n\ \ * on (cluster, instance) group_left(node)\n topk by (cluster, instance,\ \ node) (1,\n kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"\ }\n )\n > 60\n for: 15m\n labels:\n severity: warning\n \ \ - alert: KubeletClientCertificateExpiration\n annotations:\n description:\ \ Client certificate for Kubelet on node {{ $labels.node }} expires\n in\ \ {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration\n\ \ summary: Kubelet client certificate is about to expire.\n expr: kubelet_certificate_manager_client_ttl_seconds\ \ < 604800\n labels:\n severity: warning\n - alert: KubeletClientCertificateExpiration\n\ \ annotations:\n description: Client certificate for Kubelet on node {{\ \ $labels.node }} expires\n in {{ $value | humanizeDuration }} on cluster\ \ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration\n\ \ summary: Kubelet client certificate is about to expire.\n expr: kubelet_certificate_manager_client_ttl_seconds\ \ < 86400\n labels:\n severity: critical\n - alert: KubeletServerCertificateExpiration\n\ \ annotations:\n description: Server certificate for Kubelet on node {{\ \ $labels.node }} expires\n in {{ $value | humanizeDuration }} on cluster\ \ {{ 
$labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration\n\ \ summary: Kubelet server certificate is about to expire.\n expr: kubelet_certificate_manager_server_ttl_seconds\ \ < 604800\n labels:\n severity: warning\n - alert: KubeletServerCertificateExpiration\n\ \ annotations:\n description: Server certificate for Kubelet on node {{\ \ $labels.node }} expires\n in {{ $value | humanizeDuration }} on cluster\ \ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration\n\ \ summary: Kubelet server certificate is about to expire.\n expr: kubelet_certificate_manager_server_ttl_seconds\ \ < 86400\n labels:\n severity: critical\n - alert: KubeletClientCertificateRenewalErrors\n\ \ annotations:\n description: Kubelet on node {{ $labels.node }} has failed\ \ to renew its client\n certificate ({{ $value | humanize }} errors in\ \ the last 5 minutes) on cluster\n {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors\n\ \ summary: Kubelet has failed to renew its client certificate.\n expr:\ \ increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])\n \ \ > 0\n for: 15m\n labels:\n severity: warning\n - alert: KubeletServerCertificateRenewalErrors\n\ \ annotations:\n description: Kubelet on node {{ $labels.node }} has failed\ \ to renew its server\n certificate ({{ $value | humanize }} errors in\ \ the last 5 minutes) on cluster\n {{ $labels.cluster }}.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors\n\ \ summary: Kubelet has failed to renew its server certificate.\n expr:\ \ increase(kubelet_server_expiration_renew_errors[5m]) > 0\n for: 15m\n \ \ labels:\n severity: warning\n - alert: KubeletDown\n annotations:\n\ \ description: Kubelet has disappeared from 
Prometheus target discovery on\ \ cluster\n {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown\n\ \ summary: Target disappeared from Prometheus target discovery.\n expr:\ \ |-\n count by (cluster) (kube_node_info{job=\"kube-state-metrics\"})\n\ \ unless on (cluster)\n count by (cluster) (up{job=\"kubelet\", metrics_path=\"\ /metrics\"} == 1)\n for: 15m\n labels:\n severity: critical\n" monitoring-kube-prometheus-stack-kubernetes-system-scheduler-d7bc55b3-9301-4c17-81d7-76c4590104da.yaml: "groups:\n\ - name: kubernetes-system-scheduler\n rules:\n - alert: KubeSchedulerDown\n\ \ annotations:\n description: KubeScheduler has disappeared from Prometheus\ \ target discovery.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown\n\ \ summary: Target disappeared from Prometheus target discovery.\n expr:\ \ absent(up{job=\"kube-scheduler\"})\n for: 15m\n labels:\n severity:\ \ critical\n" monitoring-kube-prometheus-stack-node-exporter-bb0e2fd6-3e20-4883-9c47-3d8d2acb1ac3.yaml: "groups:\n\ - name: node-exporter\n rules:\n - alert: NodeFilesystemSpaceFillingUp\n \ \ annotations:\n description: Filesystem on {{ $labels.device }}, mounted\ \ on {{ $labels.mountpoint\n }}, at {{ $labels.instance }} has only {{\ \ printf \"%.2f\" $value }}% available\n space left and is filling up.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup\n\ \ summary: Filesystem is predicted to run out of space within the next 24\ \ hours.\n expr: |-\n (\n node_filesystem_avail_bytes{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} * 100 < 15\n and\n predict_linear(node_filesystem_avail_bytes{job=\"\ node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0\n and\n\ \ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} == 0\n )\n 
for: 1h\n labels:\n severity: warning\n - alert:\ \ NodeFilesystemSpaceFillingUp\n annotations:\n description: Filesystem\ \ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\ \ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \ \ space left and is filling up fast.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup\n\ \ summary: Filesystem is predicted to run out of space within the next 4\ \ hours.\n expr: |-\n (\n node_filesystem_avail_bytes{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} * 100 < 10\n and\n predict_linear(node_filesystem_avail_bytes{job=\"\ node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0\n and\n\ \ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} == 0\n )\n for: 1h\n labels:\n severity: critical\n - alert:\ \ NodeFilesystemAlmostOutOfSpace\n annotations:\n description: Filesystem\ \ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\ \ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \ \ space left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace\n\ \ summary: Filesystem has less than 5% space left.\n expr: |-\n (\n\ \ node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} * 100 < 5\n and\n node_filesystem_readonly{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 30m\n labels:\n \ \ severity: warning\n - alert: NodeFilesystemAlmostOutOfSpace\n annotations:\n\ \ description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n\ \ }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}%\ \ available\n space left.\n runbook_url: 
https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace\n\ \ summary: Filesystem has less than 3% space left.\n expr: |-\n (\n\ \ node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} * 100 < 3\n and\n node_filesystem_readonly{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 30m\n labels:\n \ \ severity: critical\n - alert: NodeFilesystemFilesFillingUp\n annotations:\n\ \ description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n\ \ }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}%\ \ available\n inodes left and is filling up.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup\n\ \ summary: Filesystem is predicted to run out of inodes within the next 24\ \ hours.\n expr: |-\n (\n node_filesystem_files_free{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} * 100 < 40\n and\n predict_linear(node_filesystem_files_free{job=\"\ node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0\n and\n\ \ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} == 0\n )\n for: 1h\n labels:\n severity: warning\n - alert:\ \ NodeFilesystemFilesFillingUp\n annotations:\n description: Filesystem\ \ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\ \ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \ \ inodes left and is filling up fast.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup\n\ \ summary: Filesystem is predicted to run out of inodes within the next 4\ \ hours.\n expr: |-\n (\n node_filesystem_files_free{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\"\ 
,fstype!=\"\",mountpoint!=\"\"} * 100 < 20\n and\n predict_linear(node_filesystem_files_free{job=\"\ node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0\n and\n\ \ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} == 0\n )\n for: 1h\n labels:\n severity: critical\n - alert:\ \ NodeFilesystemAlmostOutOfFiles\n annotations:\n description: Filesystem\ \ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\ \ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \ \ inodes left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles\n\ \ summary: Filesystem has less than 5% inodes left.\n expr: |-\n \ \ (\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"\ } * 100 < 5\n and\n node_filesystem_readonly{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 1h\n labels:\n \ \ severity: warning\n - alert: NodeFilesystemAlmostOutOfFiles\n annotations:\n\ \ description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n\ \ }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}%\ \ available\n inodes left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles\n\ \ summary: Filesystem has less than 3% inodes left.\n expr: |-\n \ \ (\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\ \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"\ } * 100 < 3\n and\n node_filesystem_readonly{job=\"node-exporter\"\ ,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 1h\n labels:\n \ \ severity: critical\n - alert: NodeNetworkReceiveErrs\n annotations:\n \ \ description: '{{ $labels.instance }} interface {{ $labels.device }} has\ \ encountered\n {{ printf \"%.0f\" $value }} receive errors in the last\ \ two 
minutes.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs\n\ \ summary: Network interface is reporting many receive errors.\n expr:\ \ rate(node_network_receive_errs_total{job=\"node-exporter\"}[2m]) / rate(node_network_receive_packets_total{job=\"\ node-exporter\"}[2m])\n > 0.01\n for: 1h\n labels:\n severity:\ \ warning\n - alert: NodeNetworkTransmitErrs\n annotations:\n description:\ \ '{{ $labels.instance }} interface {{ $labels.device }} has encountered\n \ \ {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.'\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs\n\ \ summary: Network interface is reporting many transmit errors.\n expr:\ \ rate(node_network_transmit_errs_total{job=\"node-exporter\"}[2m]) / rate(node_network_transmit_packets_total{job=\"\ node-exporter\"}[2m])\n > 0.01\n for: 1h\n labels:\n severity:\ \ warning\n - alert: NodeHighNumberConntrackEntriesUsed\n annotations:\n \ \ description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of\ \ conntrack\n entries are used.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused\n\ \ summary: Number of conntrack entries is getting close to the limit.\n expr:\ \ (node_nf_conntrack_entries{job=\"node-exporter\"} / node_nf_conntrack_entries_limit)\n\ \ > 0.75\n labels:\n severity: warning\n - alert: NodeTextFileCollectorScrapeError\n\ \ annotations:\n description: Node Exporter text file collector on {{\ \ $labels.instance }} failed\n to scrape.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror\n\ \ summary: Node Exporter text file collector failed to scrape.\n expr:\ \ node_textfile_scrape_error{job=\"node-exporter\"} == 1\n labels:\n severity:\ \ warning\n - alert: NodeClockSkewDetected\n annotations:\n description:\ \ Clock at {{ $labels.instance }} is out of sync by more than 0.05s.\n \ 
\ Ensure NTP is configured correctly on this host.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected\n\ \ summary: Clock skew detected.\n expr: |-\n (\n node_timex_offset_seconds{job=\"\ node-exporter\"} > 0.05\n and\n deriv(node_timex_offset_seconds{job=\"\ node-exporter\"}[5m]) >= 0\n )\n or\n (\n node_timex_offset_seconds{job=\"\ node-exporter\"} < -0.05\n and\n deriv(node_timex_offset_seconds{job=\"\ node-exporter\"}[5m]) <= 0\n )\n for: 10m\n labels:\n severity:\ \ warning\n - alert: NodeClockNotSynchronising\n annotations:\n description:\ \ Clock at {{ $labels.instance }} is not synchronising. Ensure NTP\n is\ \ configured on this host.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising\n\ \ summary: Clock not synchronising.\n expr: |-\n min_over_time(node_timex_sync_status{job=\"\ node-exporter\"}[5m]) == 0\n and\n node_timex_maxerror_seconds{job=\"\ node-exporter\"} >= 16\n for: 10m\n labels:\n severity: warning\n \ \ - alert: NodeRAIDDegraded\n annotations:\n description: RAID array '{{\ \ $labels.device }}' at {{ $labels.instance }} is\n in degraded state due\ \ to one or more disk failures. 
Number of spare drives\n is insufficient\ \ to fix issue automatically.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded\n\ \ summary: RAID Array is degraded.\n expr: node_md_disks_required{job=\"\ node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\ }\n - ignoring (state) (node_md_disks{state=\"active\",job=\"node-exporter\"\ ,device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\ })\n > 0\n for: 15m\n labels:\n severity: critical\n - alert:\ \ NodeRAIDDiskFailure\n annotations:\n description: At least one device\ \ in RAID array at {{ $labels.instance }} failed.\n Array '{{ $labels.device\ \ }}' needs attention and possibly a disk swap.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure\n\ \ summary: Failed device in RAID array.\n expr: node_md_disks{state=\"\ failed\",job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\ }\n > 0\n labels:\n severity: warning\n - alert: NodeFileDescriptorLimit\n\ \ annotations:\n description: File descriptors limit at {{ $labels.instance\ \ }} is currently at\n {{ printf \"%.2f\" $value }}%.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit\n\ \ summary: Kernel is predicted to exhaust file descriptors limit soon.\n\ \ expr: |-\n (\n node_filefd_allocated{job=\"node-exporter\"} *\ \ 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n )\n for: 15m\n\ \ labels:\n severity: warning\n - alert: NodeFileDescriptorLimit\n \ \ annotations:\n description: File descriptors limit at {{ $labels.instance\ \ }} is currently at\n {{ printf \"%.2f\" $value }}%.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit\n\ \ summary: Kernel is predicted to exhaust file descriptors limit soon.\n\ \ expr: |-\n (\n node_filefd_allocated{job=\"node-exporter\"} *\ \ 100 / 
node_filefd_maximum{job=\"node-exporter\"} > 90\n )\n for: 15m\n\ \ labels:\n severity: critical\n - alert: NodeCPUHighUsage\n annotations:\n\ \ description: |\n CPU usage at {{ $labels.instance }} has been above\ \ 90% for the last 15 minutes, is currently at {{ printf \"%.2f\" $value }}%.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage\n\ \ summary: High CPU usage.\n expr: sum without(mode) (avg without (cpu)\ \ (rate(node_cpu_seconds_total{job=\"node-exporter\",\n mode!~\"idle|iowait\"\ }[2m]))) * 100 > 90\n for: 15m\n labels:\n severity: info\n - alert:\ \ NodeSystemSaturation\n annotations:\n description: |\n System\ \ load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes,\ \ is currently at {{ printf \"%.2f\" $value }}.\n This might indicate resource\ \ saturation on this instance and can cause it to become unresponsive.\n \ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation\n\ \ summary: System saturated, load per core is very high.\n expr: |-\n\ \ node_load1{job=\"node-exporter\"}\n / count without (cpu, mode) (node_cpu_seconds_total{job=\"\ node-exporter\", mode=\"idle\"}) > 2\n for: 15m\n labels:\n severity:\ \ warning\n - alert: NodeMemoryMajorPagesFaults\n annotations:\n description:\ \ |\n Memory major page faults are occurring at a very high rate at {{ $labels.instance\ \ }}, above 500 major page faults per second for the last 15 minutes, is currently at\ \ {{ printf \"%.2f\" $value }}.\n Please check that there is enough memory\ \ available at this instance.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults\n\ \ summary: Memory major page faults are occurring at a very high rate.\n \ \ expr: rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m]) > 500\n for:\ \ 15m\n labels:\n severity: warning\n - alert: NodeMemoryHighUtilization\n\ \ annotations:\n description: |\n Memory is filling up at {{ $labels.instance\ \ 
}}, has been above 90% for the last 15 minutes, is currently at {{ printf \"\ %.2f\" $value }}%.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization\n\ \ summary: Host is running out of memory.\n expr: 100 - (node_memory_MemAvailable_bytes{job=\"\ node-exporter\"} / node_memory_MemTotal_bytes{job=\"node-exporter\"}\n *\ \ 100) > 90\n for: 15m\n labels:\n severity: warning\n - alert: NodeDiskIOSaturation\n\ \ annotations:\n description: |\n Disk IO queue (aqu-sq) is high\ \ on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the\ \ last 30 minutes, is currently at {{ printf \"%.2f\" $value }}.\n This\ \ symptom might indicate disk saturation.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation\n\ \ summary: Disk IO queue is high.\n expr: rate(node_disk_io_time_weighted_seconds_total{job=\"\ node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\ }[5m])\n > 10\n for: 30m\n labels:\n severity: warning\n - alert:\ \ NodeSystemdServiceFailed\n annotations:\n description: Systemd service\ \ {{ $labels.name }} has entered failed state at\n {{ $labels.instance\ \ }}\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed\n\ \ summary: Systemd service has entered failed state.\n expr: node_systemd_unit_state{job=\"\ node-exporter\", state=\"failed\"} == 1\n for: 5m\n labels:\n severity:\ \ warning\n - alert: NodeSystemdServiceCrashlooping\n annotations:\n \ \ description: Systemd service {{ $labels.name }} has been restarted too many\n\ \ times at {{ $labels.instance }} for the last 15 minutes. 
Please check\ \ if service\n is crash looping.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicecrashlooping\n\ \ summary: Systemd service keeps restarting, possibly crash looping.\n \ \ expr: increase(node_systemd_service_restart_total{job=\"node-exporter\"}[5m])\ \ >\n 2\n for: 15m\n labels:\n severity: warning\n - alert: NodeBondingDegraded\n\ \ annotations:\n description: Bonding interface {{ $labels.master }} on\ \ {{ $labels.instance }}\n is in degraded state due to one or more slave\ \ failures.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded\n\ \ summary: Bonding interface is degraded.\n expr: (node_bonding_slaves{job=\"\ node-exporter\"} - node_bonding_active{job=\"node-exporter\"})\n != 0\n \ \ for: 5m\n labels:\n severity: warning\n" monitoring-kube-prometheus-stack-node-exporter.rules-501fd5f0-0366-455a-80cc-5e208856f211.yaml: "groups:\n\ - name: node-exporter.rules\n rules:\n - expr: |-\n count without (cpu,\ \ mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"\ }\n )\n record: instance:node_num_cpu:sum\n - expr: |-\n 1 - avg\ \ without (cpu) (\n sum without (mode) (rate(node_cpu_seconds_total{job=\"\ node-exporter\", mode=~\"idle|iowait|steal\"}[5m]))\n )\n record: instance:node_cpu_utilisation:rate5m\n\ \ - expr: |-\n (\n node_load1{job=\"node-exporter\"}\n /\n \ \ instance:node_num_cpu:sum{job=\"node-exporter\"}\n )\n record:\ \ instance:node_load1_per_cpu:ratio\n - expr: |-\n 1 - (\n (\n \ \ node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n\ \ (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n \ \ +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n \ \ +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n\ \ +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n \ \ )\n )\n /\n node_memory_MemTotal_bytes{job=\"node-exporter\"\ }\n )\n record: instance:node_memory_utilisation:ratio\n - expr: rate(node_vmstat_pgmajfault{job=\"\ 
node-exporter\"}[5m])\n record: instance:node_vmstat_pgmajfault:rate5m\n -\ \ expr: rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device=~\"\ (/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[5m])\n\ \ record: instance_device:node_disk_io_time_seconds:rate5m\n - expr: rate(node_disk_io_time_weighted_seconds_total{job=\"\ node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\ }[5m])\n record: instance_device:node_disk_io_time_weighted_seconds:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_receive_bytes_total{job=\"\ node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_receive_bytes_excluding_lo:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"\ node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_transmit_bytes_excluding_lo:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_receive_drop_total{job=\"\ node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_receive_drop_excluding_lo:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_transmit_drop_total{job=\"\ node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_transmit_drop_excluding_lo:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_receive_bytes_total{job=\"\ node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_receive_bytes_physical:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"\ node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_transmit_bytes_physical:rate5m\n\ \ - expr: |-\n sum without (device) (\n rate(node_network_receive_drop_total{job=\"\ node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_receive_drop_physical:rate5m\n\ \ - expr: |-\n sum without (device) (\n 
rate(node_network_transmit_drop_total{job=\"\ node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_transmit_drop_physical:rate5m\n" monitoring-kube-prometheus-stack-node-network-6268a5b5-2d1a-4ed0-b8b3-a03a12b3390d.yaml: "groups:\n\ - name: node-network\n rules:\n - alert: NodeNetworkInterfaceFlapping\n annotations:\n\ \ description: Network interface \"{{ $labels.device }}\" changing its up\ \ status\n often on node-exporter {{ $labels.namespace }}/{{ $labels.pod\ \ }}\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping\n\ \ summary: Network interface is often changing its status\n expr: changes(node_network_up{job=\"\ node-exporter\",device!~\"veth.+\"}[2m]) > 2\n for: 2m\n labels:\n \ \ severity: warning\n" monitoring-kube-prometheus-stack-node.rules-a9b43f41-87eb-4fea-856f-1f513c9d1ee3.yaml: "groups:\n\ - name: node.rules\n rules:\n - expr: |-\n topk by (cluster, namespace,\ \ pod) (1,\n max by (cluster, node, namespace, pod) (\n label_replace(kube_pod_info{job=\"\ kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n ))\n\ \ record: 'node_namespace_pod:kube_pod_info:'\n - expr: |-\n count by\ \ (cluster, node) (\n node_cpu_seconds_total{mode=\"idle\",job=\"node-exporter\"\ }\n * on (cluster, namespace, pod) group_left(node)\n topk by (cluster,\ \ namespace, pod) (1, node_namespace_pod:kube_pod_info:)\n )\n record:\ \ node:node_num_cpu:sum\n - expr: |-\n sum(\n node_memory_MemAvailable_bytes{job=\"\ node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"\ } +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"\ node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n\ \ )\n ) by (cluster)\n record: :node_memory_MemAvailable_bytes:sum\n\ \ - expr: |-\n avg by (cluster, node) (\n sum without (mode) (\n\ \ rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"\ 
steal\",job=\"node-exporter\"}[5m])\n )\n )\n record: node:node_cpu_utilization:ratio_rate5m\n\ \ - expr: |-\n avg by (cluster) (\n node:node_cpu_utilization:ratio_rate5m\n\ \ )\n record: cluster:node_cpu:ratio_rate5m\n" monitoring-kube-prometheus-stack-prometheus-7347b3a7-f0fa-4d43-bd9e-c3f35a7087b9.yaml: "groups:\n\ - name: prometheus\n rules:\n - alert: PrometheusBadConfig\n annotations:\n\ \ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed\ \ to\n reload its configuration.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig\n\ \ summary: Failed Prometheus configuration reload.\n expr: |-\n #\ \ Without max_over_time, failed scrapes could create false negatives, see\n \ \ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for\ \ details.\n max_over_time(prometheus_config_last_reload_successful{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) == 0\n for:\ \ 10m\n labels:\n severity: critical\n - alert: PrometheusSDRefreshFailure\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has failed to\n refresh SD with mechanism {{$labels.mechanism}}.\n \ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure\n\ \ summary: Failed Prometheus SD refresh.\n expr: increase(prometheus_sd_refresh_failures_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[10m])\n > 0\n\ \ for: 20m\n labels:\n severity: warning\n - alert: PrometheusKubernetesListWatchFailures\n\ \ annotations:\n description: Kubernetes service discovery of Prometheus\ \ {{$labels.namespace}}/{{$labels.pod}}\n is experiencing {{ printf \"\ %.0f\" $value }} failures with LIST/WATCH requests\n to the Kubernetes\ \ API in the last 5 minutes.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuskuberneteslistwatchfailures\n\ \ summary: Requests in Kubernetes SD are 
failing.\n expr: increase(prometheus_sd_kubernetes_failures_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusNotificationQueueRunningFull\n\ \ annotations:\n description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}\n\ \ is running full.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull\n\ \ summary: Prometheus alert notification queue predicted to run full in less\ \ than\n 30m.\n expr: |-\n # Without min_over_time, failed scrapes\ \ could create false negatives, see\n # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\ \ for details.\n (\n predict_linear(prometheus_notifications_queue_length{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m], 60 * 30)\n \ \ >\n min_over_time(prometheus_notifications_queue_capacity{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n )\n \ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers\n\ \ annotations:\n description: '{{ printf \"%.1f\" $value }}% of alerts\ \ sent by Prometheus {{$labels.namespace}}/{{$labels.pod}}\n to Alertmanager\ \ {{$labels.alertmanager}} were affected by errors.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers\n\ \ summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager\n\ \ were affected by errors.\n expr: |-\n (\n rate(prometheus_notifications_errors_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n /\n \ \ rate(prometheus_notifications_sent_total{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\"}[5m])\n )\n * 100\n > 1\n for: 15m\n\ \ labels:\n severity: warning\n - alert: PrometheusNotConnectedToAlertmanagers\n\ \ annotations:\n description: 
Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ is not connected\n to any Alertmanagers.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers\n\ \ summary: Prometheus is not connected to any Alertmanagers.\n expr: |-\n\ \ # Without max_over_time, failed scrapes could create false negatives, see\n\ \ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\ \ for details.\n max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) < 1\n for:\ \ 10m\n labels:\n severity: warning\n - alert: PrometheusTSDBReloadsFailing\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has detected {{$value\n | humanize}} reload failures over the last 3h.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing\n\ \ summary: Prometheus has issues reloading blocks from disk.\n expr: increase(prometheus_tsdb_reloads_failures_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[3h])\n > 0\n\ \ for: 4h\n labels:\n severity: warning\n - alert: PrometheusTSDBCompactionsFailing\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has detected {{$value\n | humanize}} compaction failures over the last\ \ 3h.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing\n\ \ summary: Prometheus has issues compacting blocks.\n expr: increase(prometheus_tsdb_compactions_failed_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[3h])\n > 0\n\ \ for: 4h\n labels:\n severity: warning\n - alert: PrometheusNotIngestingSamples\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ is not ingesting\n samples.\n runbook_url: 
https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples\n\ \ summary: Prometheus is not ingesting samples.\n expr: |-\n (\n\ \ sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])) <= 0\n \ \ and\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}) > 0\n or\n\ \ sum without(rule_group) (prometheus_rule_group_rules{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\"}) > 0\n )\n )\n for: 10m\n labels:\n\ \ severity: warning\n - alert: PrometheusDuplicateTimestamps\n annotations:\n\ \ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping\ \ {{\n printf \"%.4g\" $value }} samples/s with different values but duplicated\ \ timestamp.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps\n\ \ summary: Prometheus is dropping samples with duplicate timestamps.\n \ \ expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 10m\n labels:\n severity: warning\n - alert: PrometheusOutOfOrderTimestamps\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ is dropping {{\n printf \"%.4g\" $value }} samples/s with timestamps\ \ arriving out of order.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps\n\ \ summary: Prometheus drops samples with out-of-order timestamps.\n expr:\ \ rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\"}[5m])\n > 0\n for: 10m\n labels:\n severity:\ \ warning\n - alert: PrometheusRemoteStorageFailures\n annotations:\n \ \ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send\n\ \ 
{{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{\n\ \ $labels.url }}\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures\n\ \ summary: Prometheus fails to send samples to remote storage.\n expr:\ \ |-\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]))\n /\n\ \ (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]))\n +\n\ \ (rate(prometheus_remote_storage_succeeded_samples_total{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]))\n )\n\ \ )\n * 100\n > 1\n for: 15m\n labels:\n severity: critical\n\ \ - alert: PrometheusRemoteWriteBehind\n annotations:\n description:\ \ Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is\n {{\ \ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url\n\ \ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind\n\ \ summary: Prometheus remote write is behind.\n expr: |-\n # Without\ \ max_over_time, failed scrapes could create false negatives, see\n # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\ \ for details.\n (\n max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n -\n \ \ max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"\ 
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n )\n \ \ > 120\n for: 15m\n labels:\n severity: critical\n - alert: PrometheusRemoteWriteDesiredShards\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ remote write desired\n shards calculation wants to run {{ $value }} shards\ \ for queue {{ $labels.remote_name}}:{{\n $labels.url }}, which is more\ \ than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"\ %s\",job=\"kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}`\n \ \ $labels.instance | query | first | value }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards\n\ \ summary: Prometheus remote write desired shards calculation wants to run\ \ more\n than configured max shards.\n expr: |-\n # Without max_over_time,\ \ failed scrapes could create false negatives, see\n # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\ \ for details.\n (\n max_over_time(prometheus_remote_storage_shards_desired{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n >\n \ \ max_over_time(prometheus_remote_storage_shards_max{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\"}[5m])\n )\n for: 15m\n labels:\n severity:\ \ warning\n - alert: PrometheusRuleFailures\n annotations:\n description:\ \ Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to\n evaluate\ \ {{ printf \"%.0f\" $value }} rules in the last 5m.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures\n\ \ summary: Prometheus is failing rule evaluations.\n expr: increase(prometheus_rule_evaluation_failures_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: critical\n - alert: PrometheusMissingRuleEvaluations\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has 
missed {{\n printf \"%.0f\" $value }} rule group evaluations in the\ \ last 5m.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations\n\ \ summary: Prometheus is missing rule evaluations due to slow rule group\ \ evaluation.\n expr: increase(prometheus_rule_group_iterations_missed_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusTargetLimitHit\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has dropped {{\n printf \"%.0f\" $value }} targets because the number\ \ of targets exceeded the\n configured target_limit.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit\n\ \ summary: Prometheus has dropped targets because some scrape configs have\ \ exceeded\n the targets limit.\n expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusLabelLimitHit\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has dropped {{\n printf \"%.0f\" $value }} targets because some samples\ \ exceeded the configured\n label_limit, label_name_length_limit or label_value_length_limit.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit\n\ \ summary: Prometheus has dropped targets because some scrape configs have\ \ exceeded\n the labels limit.\n expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusScrapeBodySizeLimitHit\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has failed {{\n printf \"%.0f\" $value }} 
scrapes in the last 5m because\ \ some targets exceeded\n the configured body_size_limit.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit\n\ \ summary: Prometheus has dropped some targets that exceeded body size limit.\n\ \ expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusScrapeSampleLimitHit\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ has failed {{\n printf \"%.0f\" $value }} scrapes in the last 5m because\ \ some targets exceeded\n the configured sample_limit.\n runbook_url:\ \ https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit\n\ \ summary: Prometheus has failed scrapes that have exceeded the configured\ \ sample\n limit.\n expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusTargetSyncFailure\n\ \ annotations:\n description: '{{ printf \"%.0f\" $value }} targets in\ \ Prometheus {{$labels.namespace}}/{{$labels.pod}}\n have failed to sync\ \ because invalid configuration was supplied.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure\n\ \ summary: Prometheus has failed to sync targets.\n expr: increase(prometheus_target_sync_failed_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[30m])\n > 0\n\ \ for: 5m\n labels:\n severity: critical\n - alert: PrometheusHighQueryLoad\n\ \ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ query API has\n less than 20% available capacity in its query engine\ \ for the last 15 minutes.\n runbook_url: 
https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload\n\ \ summary: Prometheus is reaching its maximum capacity serving concurrent\ \ requests.\n expr: avg_over_time(prometheus_engine_queries{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\"}[5m])\n / max_over_time(prometheus_engine_queries_concurrent_max{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0.8\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusErrorSendingAlertsToAnyAlertmanager\n\ \ annotations:\n description: '{{ printf \"%.1f\" $value }}% minimum errors\ \ while sending alerts\n from Prometheus {{$labels.namespace}}/{{$labels.pod}}\ \ to any Alertmanager.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager\n\ \ summary: Prometheus encounters more than 3% errors sending alerts to any\ \ Alertmanager.\n expr: |-\n min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"\ kube-prometheus-stack-prometheus\",namespace=\"monitoring\",alertmanager!~``}[5m])\n\ \ /\n rate(prometheus_notifications_sent_total{job=\"kube-prometheus-stack-prometheus\"\ ,namespace=\"monitoring\",alertmanager!~``}[5m])\n )\n * 100\n \ \ > 3\n for: 15m\n labels:\n severity: critical\n" monitoring-kube-prometheus-stack-prometheus-operator-6e123f81-1bd4-4e2f-b8ae-c1287bf5cd37.yaml: "groups:\n\ - name: prometheus-operator\n rules:\n - alert: PrometheusOperatorListErrors\n\ \ annotations:\n description: Errors while performing List operations\ \ in controller {{$labels.controller}}\n in {{$labels.namespace}} namespace.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors\n\ \ summary: Errors while performing list operations in controller.\n expr:\ \ (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"\ 
kube-prometheus-stack-operator\",namespace=\"monitoring\"}[10m]))\n / sum\ \ by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job=\"\ kube-prometheus-stack-operator\",namespace=\"monitoring\"}[10m])))\n > 0.4\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusOperatorWatchErrors\n\ \ annotations:\n description: Errors while performing watch operations\ \ in controller {{$labels.controller}}\n in {{$labels.namespace}} namespace.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors\n\ \ summary: Errors while performing watch operations in controller.\n expr:\ \ (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"\ kube-prometheus-stack-operator\",namespace=\"monitoring\"}[5m]))\n / sum\ \ by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"\ kube-prometheus-stack-operator\",namespace=\"monitoring\"}[5m])))\n > 0.4\n\ \ for: 15m\n labels:\n severity: warning\n - alert: PrometheusOperatorSyncFailed\n\ \ annotations:\n description: Controller {{ $labels.controller }} in {{\ \ $labels.namespace }}\n namespace fails to reconcile {{ $value }} objects.\n\ \ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed\n\ \ summary: Last controller reconciliation failed\n expr: min_over_time(prometheus_operator_syncs{status=\"\ failed\",job=\"kube-prometheus-stack-operator\",namespace=\"monitoring\"}[5m])\n\ \ > 0\n for: 10m\n labels:\n severity: warning\n - alert: PrometheusOperatorReconcileErrors\n\ \ annotations:\n description: '{{ $value | humanizePercentage }} of reconciling\ \ operations failed\n for {{ $labels.controller }} controller in {{ $labels.namespace\ \ }} namespace.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors\n\ \ summary: Errors while 
reconciling objects.\n expr: (sum by (cluster,controller,namespace)\ \ (rate(prometheus_operator_reconcile_errors_total{job=\"kube-prometheus-stack-operator\"\ ,namespace=\"monitoring\"}[5m])))\n / (sum by (cluster,controller,namespace)\ \ (rate(prometheus_operator_reconcile_operations_total{job=\"kube-prometheus-stack-operator\"\ ,namespace=\"monitoring\"}[5m])))\n > 0.1\n for: 10m\n labels:\n \ \ severity: warning\n - alert: PrometheusOperatorStatusUpdateErrors\n annotations:\n\ \ description: '{{ $value | humanizePercentage }} of status update operations\n\ \ failed for {{ $labels.controller }} controller in {{ $labels.namespace\ \ }}\n namespace.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorstatusupdateerrors\n\ \ summary: Errors while updating objects status.\n expr: (sum by (cluster,controller,namespace)\ \ (rate(prometheus_operator_status_update_errors_total{job=\"kube-prometheus-stack-operator\"\ ,namespace=\"monitoring\"}[5m])))\n / (sum by (cluster,controller,namespace)\ \ (rate(prometheus_operator_status_update_operations_total{job=\"kube-prometheus-stack-operator\"\ ,namespace=\"monitoring\"}[5m])))\n > 0.1\n for: 10m\n labels:\n \ \ severity: warning\n - alert: PrometheusOperatorNodeLookupErrors\n annotations:\n\ \ description: Errors while reconciling Prometheus in {{ $labels.namespace\ \ }}\n Namespace.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors\n\ \ summary: Errors while reconciling Prometheus.\n expr: rate(prometheus_operator_node_address_lookup_errors_total{job=\"\ kube-prometheus-stack-operator\",namespace=\"monitoring\"}[5m])\n > 0.1\n\ \ for: 10m\n labels:\n severity: warning\n - alert: PrometheusOperatorNotReady\n\ \ annotations:\n description: Prometheus operator in {{ $labels.namespace\ \ }} namespace isn't\n ready to reconcile {{ $labels.controller }} resources.\n\ \ runbook_url: 
https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready\n\ \ summary: Prometheus operator not ready\n expr: min by (cluster,controller,namespace)\ \ (max_over_time(prometheus_operator_ready{job=\"kube-prometheus-stack-operator\"\ ,namespace=\"monitoring\"}[5m])\n == 0)\n for: 5m\n labels:\n \ \ severity: warning\n - alert: PrometheusOperatorRejectedResources\n annotations:\n\ \ description: Prometheus operator in {{ $labels.namespace }} namespace rejected\n\ \ {{ printf \"%0.0f\" $value }} {{ $labels.controller }}/{{ $labels.resource\ \ }}\n resources.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources\n\ \ summary: Resources rejected by Prometheus operator\n expr: min_over_time(prometheus_operator_managed_resources{state=\"\ rejected\",job=\"kube-prometheus-stack-operator\",namespace=\"monitoring\"}[5m])\n\ \ > 0\n for: 5m\n labels:\n severity: warning\n" kind: ConfigMap metadata: labels: app.kubernetes.io/managed-by: prometheus-operator managed-by: prometheus-operator prometheus-name: kube-prometheus-stack-prometheus name: prometheus-kube-prometheus-stack-prometheus-rulefiles-0 namespace: monitoring ownerReferences: - apiVersion: monitoring.coreos.com/v1 blockOwnerDeletion: true controller: true kind: Prometheus name: kube-prometheus-stack-prometheus uid: f0355616-4bfa-4409-8b5f-c1c815ee7a2a