# Files
# k8s-manifests/monitoring/configmap-prometheus-kube-prometheus-stack-prometheus-rulefiles-0.yaml
# chemavx ff2e6cc985 feat: export all K8 Plus cluster manifests
# Namespaces: argocd, authentik, backup-system, cloudflare-ddns,
# gitea, homarr, monitoring, n8n, openclaw, polymarket-bot, vaultwarden
# Cluster-wide: clusterissuers, namespaces
# Secrets: redacted (structure only, data=REDACTED)
# 2026-04-10 08:57:02 +00:00

# 1706 lines
# 158 KiB
# YAML

# Core v1 API object. Each entry that follows `data:` maps a generated
# rule-file name to one Prometheus rule group, stored as an escaped,
# double-quoted YAML string.
# NOTE(review): the entry names carry UUID suffixes, which suggests this object
# is generated (presumably by prometheus-operator from PrometheusRule
# resources) - confirm before hand-editing, as manual changes may be overwritten.
apiVersion: v1
data:
# Rule group `config-reloaders`: a single alert, ConfigReloaderSidecarErrors
# (severity: warning). It fires when
# max_over_time(reloader_last_reload_successful[5m]) == 0 has held for 10m,
# i.e. the config-reloader sidecar has not reported a successful reload.
monitoring-kube-prometheus-stack-config-reloaders-eae692b3-e0b3-459e-8981-8dc6d7da6055.yaml: "groups:\n\
- name: config-reloaders\n rules:\n - alert: ConfigReloaderSidecarErrors\n \
\ annotations:\n description: |-\n Errors encountered while the\
\ {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}}\
\ namespace.\n As a result, configuration for service running in {{$labels.pod}}\
\ may be stale and cannot be updated anymore.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors\n\
\ summary: config-reloader sidecar has not had a successful reload for 10m\n\
\ expr: max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m])\
\ == 0\n for: 10m\n labels:\n severity: warning\n"
# Rule group `etcd`: alerting rules over series matched by job=~".*etcd.*".
# Alert -> condition; hold time; severity:
#   etcdMembersDown                    member down or peer send failures
#                                      > 0.01/s; 20m; warning
#   etcdInsufficientMembers            up members < (count + 1) / 2; 3m; critical
#   etcdNoLeader                       etcd_server_has_leader == 0; 1m; critical
#   etcdHighNumberOfLeaderChanges      >= 4 leader changes over 15m; 5m; warning
#   etcdHighNumberOfFailedGRPCRequests failed gRPC share > 1%; 10m; warning
#                                      failed gRPC share > 5%; 5m; critical
#   etcdGRPCRequestsSlow               p99 unary gRPC latency > 0.15s
#                                      (Defragment excluded); 10m; critical
#   etcdMemberCommunicationSlow        p99 peer round trip > 0.15s; 10m; warning
#   etcdHighNumberOfFailedProposals    15m failed-proposal rate > 5; 15m; warning
#   etcdHighFsyncDurations             p99 WAL fsync > 0.5s; 10m; warning
#                                      p99 WAL fsync > 1s; 10m; critical
#   etcdHighCommitDurations            p99 backend commit > 0.25s; 10m; warning
#   etcdDatabaseQuotaLowSpace          DB size / quota > 95%; 10m; critical
#   etcdExcessiveDatabaseGrowth        4h linear prediction of DB size exceeds
#                                      the quota; 10m; warning
#   etcdDatabaseHighFragmentationRatio in-use / total size < 0.5 and in-use
#                                      > 104857600 bytes; 10m; warning
# Two alert names appear twice, each copy with its own threshold and severity.
# The etcdHighCommitDurations description reads "commit durations are
# {{ $value }}s", matching the wording of the etcdHighFsyncDurations text.
# NOTE(review): etcdHighNumberOfFailedProposals evaluates rate(...[15m]) but its
# description says "within the last 30 minutes" - confirm the intended wording.
monitoring-kube-prometheus-stack-etcd-de0d66c3-becc-4bd9-8ab6-dff75f452f02.yaml: "groups:\n\
- name: etcd\n rules:\n - alert: etcdMembersDown\n annotations:\n description:\
\ 'etcd cluster \"{{ $labels.job }}\": members are down ({{ $value\n }}).'\n\
\ summary: etcd cluster members are down.\n expr: |-\n max without\
\ (endpoint) (\n sum without (instance, pod) (up{job=~\".*etcd.*\"} ==\
\ bool 0)\n or\n count without (To) (\n sum without (instance,\
\ pod) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s]))\
\ > 0.01\n )\n )\n > 0\n for: 20m\n labels:\n severity:\
\ warning\n - alert: etcdInsufficientMembers\n annotations:\n description:\
\ 'etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value\n \
\ }}).'\n summary: etcd cluster has insufficient number of members.\n \
\ expr: sum(up{job=~\".*etcd.*\"} == bool 1) without (instance, pod) < ((count(up{job=~\"\
.*etcd.*\"})\n without (instance, pod) + 1) / 2)\n for: 3m\n labels:\n\
\ severity: critical\n - alert: etcdNoLeader\n annotations:\n description:\
\ 'etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }}\n \
\ has no leader.'\n summary: etcd cluster has no leader.\n expr: etcd_server_has_leader{job=~\"\
.*etcd.*\"} == 0\n for: 1m\n labels:\n severity: critical\n - alert:\
\ etcdHighNumberOfLeaderChanges\n annotations:\n description: 'etcd cluster\
\ \"{{ $labels.job }}\": {{ $value }} leader changes\n within the last\
\ 15 minutes. Frequent elections may be a sign of insufficient\n resources,\
\ high network latency, or disruptions by other components and should\n \
\ be investigated.'\n summary: etcd cluster has high number of leader changes.\n\
\ expr: increase((max without (instance, pod) (etcd_server_leader_changes_seen_total{job=~\"\
.*etcd.*\"})\n or 0*absent(etcd_server_leader_changes_seen_total{job=~\"\
.*etcd.*\"}))[15m:1m])\n >= 4\n for: 5m\n labels:\n severity:\
\ warning\n - alert: etcdHighNumberOfFailedGRPCRequests\n annotations:\n \
\ description: 'etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests\
\ for\n {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance\
\ }}.'\n summary: etcd cluster has high number of failed grpc requests.\n\
\ expr: |-\n 100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\"\
, grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\
}[5m])) without (grpc_type, grpc_code)\n /\n sum(rate(grpc_server_handled_total{job=~\"\
.*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n for: 10m\n\
\ labels:\n severity: warning\n - alert: etcdHighNumberOfFailedGRPCRequests\n\
\ annotations:\n description: 'etcd cluster \"{{ $labels.job }}\": {{\
\ $value }}% of requests for\n {{ $labels.grpc_method }} failed on etcd\
\ instance {{ $labels.instance }}.'\n summary: etcd cluster has high number\
\ of failed grpc requests.\n expr: |-\n 100 * sum(rate(grpc_server_handled_total{job=~\"\
.*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\
}[5m])) without (grpc_type, grpc_code)\n /\n sum(rate(grpc_server_handled_total{job=~\"\
.*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n for: 5m\n \
\ labels:\n severity: critical\n - alert: etcdGRPCRequestsSlow\n annotations:\n\
\ description: 'etcd cluster \"{{ $labels.job }}\": 99th percentile of gRPC\
\ requests\n is {{ $value }}s on etcd instance {{ $labels.instance }} for\
\ {{ $labels.grpc_method\n }} method.'\n summary: etcd grpc requests\
\ are slow\n expr: |-\n histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\"\
.*etcd.*\", grpc_method!=\"Defragment\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n\
\ > 0.15\n for: 10m\n labels:\n severity: critical\n - alert:\
\ etcdMemberCommunicationSlow\n annotations:\n description: 'etcd cluster\
\ \"{{ $labels.job }}\": member communication with {{\n $labels.To }} is\
\ taking {{ $value }}s on etcd instance {{ $labels.instance\n }}.'\n \
\ summary: etcd cluster member communication is slow.\n expr: |-\n \
\ histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\"\
.*etcd.*\"}[5m]))\n > 0.15\n for: 10m\n labels:\n severity: warning\n\
\ - alert: etcdHighNumberOfFailedProposals\n annotations:\n description:\
\ 'etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures\n \
\ within the last 30 minutes on etcd instance {{ $labels.instance }}.'\n \
\ summary: etcd cluster has high number of proposal failures.\n expr: rate(etcd_server_proposals_failed_total{job=~\"\
.*etcd.*\"}[15m]) > 5\n for: 15m\n labels:\n severity: warning\n -\
\ alert: etcdHighFsyncDurations\n annotations:\n description: 'etcd cluster\
\ \"{{ $labels.job }}\": 99th percentile fsync durations\n are {{ $value\
\ }}s on etcd instance {{ $labels.instance }}.'\n summary: etcd cluster 99th\
\ percentile fsync durations are too high.\n expr: |-\n histogram_quantile(0.99,\
\ rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n\
\ > 0.5\n for: 10m\n labels:\n severity: warning\n - alert: etcdHighFsyncDurations\n\
\ annotations:\n description: 'etcd cluster \"{{ $labels.job }}\": 99th\
\ percentile fsync durations\n are {{ $value }}s on etcd instance {{ $labels.instance\
\ }}.'\n summary: etcd cluster 99th percentile fsync durations are too high.\n\
\ expr: |-\n histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\"\
.*etcd.*\"}[5m]))\n > 1\n for: 10m\n labels:\n severity: critical\n\
\ - alert: etcdHighCommitDurations\n annotations:\n description: 'etcd\
\ cluster \"{{ $labels.job }}\": 99th percentile commit durations are\n {{\
\ $value }}s on etcd instance {{ $labels.instance }}.'\n summary: etcd cluster\
\ 99th percentile commit durations are too high.\n expr: |-\n histogram_quantile(0.99,\
\ rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n\
\ > 0.25\n for: 10m\n labels:\n severity: warning\n - alert:\
\ etcdDatabaseQuotaLowSpace\n annotations:\n description: 'etcd cluster\
\ \"{{ $labels.job }}\": database size exceeds the defined\n quota on etcd\
\ instance {{ $labels.instance }}, please defrag or increase the\n quota\
\ as the writes to etcd will be disabled when it is full.'\n summary: etcd\
\ cluster database is running full.\n expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~\"\
.*etcd.*\"}[5m]) /\n last_over_time(etcd_server_quota_backend_bytes{job=~\"\
.*etcd.*\"}[5m]))*100 >\n 95\n for: 10m\n labels:\n severity:\
\ critical\n - alert: etcdExcessiveDatabaseGrowth\n annotations:\n description:\
\ 'etcd cluster \"{{ $labels.job }}\": Predicting running out of disk\n \
\ space in the next four hours, based on write observations within the past\n\
\ four hours on etcd instance {{ $labels.instance }}, please check as it\
\ might\n be disruptive.'\n summary: etcd cluster database growing\
\ very fast.\n expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~\"\
.*etcd.*\"}[4h], 4*60*60)\n > etcd_server_quota_backend_bytes{job=~\".*etcd.*\"\
}\n for: 10m\n labels:\n severity: warning\n - alert: etcdDatabaseHighFragmentationRatio\n\
\ annotations:\n description: 'etcd cluster \"{{ $labels.job }}\": database\
\ size in use on instance\n {{ $labels.instance }} is {{ $value | humanizePercentage\
\ }} of the actual\n allocated disk space, please run defragmentation (e.g.\
\ etcdctl defrag) to\n retrieve the unused fragmented disk space.'\n \
\ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation\n\
\ summary: etcd database size in use is less than 50% of the actual allocated\n\
\ storage.\n expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~\"\
.*etcd.*\"}[5m])\n / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~\"\
.*etcd.*\"}[5m])) < 0.5\n and etcd_mvcc_db_total_size_in_use_in_bytes{job=~\"\
.*etcd.*\"} > 104857600\n for: 10m\n labels:\n severity: warning\n"
# Rule group `general.rules`:
#   TargetDown     more than 10% of the targets per (cluster, job, namespace,
#                  service) report up == 0; for 10m; severity warning.
#   Watchdog       expr is vector(1), so the alert is always firing; severity
#                  none. Used to prove the alerting pipeline works end to end.
#   InfoInhibitor  fires for a namespace that has a severity="info" alert,
#                  unless a firing warning/critical alert other than
#                  InfoInhibitor exists in the same namespace; severity none.
monitoring-kube-prometheus-stack-general.rules-f627c7c8-ea4b-4b56-98b8-e667d6567e7b.yaml: "groups:\n\
- name: general.rules\n rules:\n - alert: TargetDown\n annotations:\n \
\ description: '{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service\n\
\ }} targets in {{ $labels.namespace }} namespace are down.'\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/general/targetdown\n \
\ summary: One or more targets are unreachable.\n expr: 100 * (count(up ==\
\ 0) BY (cluster, job, namespace, service) / count(up)\n BY (cluster, job,\
\ namespace, service)) > 10\n for: 10m\n labels:\n severity: warning\n\
\ - alert: Watchdog\n annotations:\n description: |\n This is\
\ an alert meant to ensure that the entire alerting pipeline is functional.\n\
\ This alert is always firing, therefore it should always be firing in\
\ Alertmanager\n and always fire against a receiver. There are integrations\
\ with various notification\n mechanisms that send a notification when\
\ this alert is not firing. For example the\n \"DeadMansSnitch\" integration\
\ in PagerDuty.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog\n\
\ summary: An alert that should always be firing to certify that Alertmanager\n\
\ is working properly.\n expr: vector(1)\n labels:\n severity:\
\ none\n - alert: InfoInhibitor\n annotations:\n description: |\n \
\ This is an alert that is used to inhibit info alerts.\n By themselves,\
\ the info-level alerts are sometimes very noisy, but they are relevant when combined\
\ with\n other alerts.\n This alert fires whenever there's a severity=\"\
info\" alert, and stops firing when another alert with a\n severity of\
\ 'warning' or 'critical' starts firing on the same namespace.\n This alert\
\ should be routed to a null receiver and configured to inhibit alerts with severity=\"\
info\".\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor\n\
\ summary: Info-level alert inhibition.\n expr: group by (namespace) (ALERTS{severity\
\ = \"info\"} == 1) unless on (namespace)\n group by (namespace) (ALERTS{alertname\
\ != \"InfoInhibitor\", alertstate = \"firing\",\n severity =~ \"warning|critical\"\
} == 1)\n labels:\n severity: none\n"
# Rule group `k8s.rules.container_cpu_usage_seconds_total`: two recording rules.
# Both sum container_cpu_usage_seconds_total (job="kubelet",
# metrics_path="/metrics/cadvisor", non-empty image) per
# (cluster, namespace, pod, container) and join against kube_pod_info with
# group_left(node) to attach the node label.
#   ...:sum_rate5m  uses rate(...[5m])
#   ...:sum_irate   uses irate(...[5m])
monitoring-kube-prometheus-stack-k8s.rules.container-cpu-usage-seconds-tot-715fb365-db24-4478-8fdf-40df40c31616.yaml: "groups:\n\
- name: k8s.rules.container_cpu_usage_seconds_total\n rules:\n - expr: |-\n\
\ sum by (cluster, namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"\
kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}[5m])\n ) * on\
\ (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod)\
\ (\n 1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\
\"})\n )\n record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m\n\
\ - expr: |-\n sum by (cluster, namespace, pod, container) (\n irate(container_cpu_usage_seconds_total{job=\"\
kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}[5m])\n ) * on\
\ (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod)\
\ (\n 1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\
\"})\n )\n record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate\n"
# Rule group `k8s.rules.container_memory_cache`: one recording rule,
# node_namespace_pod_container:container_memory_cache. It takes
# container_memory_cache (job="kubelet", metrics_path="/metrics/cadvisor",
# non-empty image) and multiplies it, on (cluster, namespace, pod), by one
# kube_pod_info series per pod (topk 1) so the node label is carried over.
monitoring-kube-prometheus-stack-k8s.rules.container-memory-cache-c548651c-95bc-4bc3-a2d8-6fb97abc9ec3.yaml: "groups:\n\
- name: k8s.rules.container_memory_cache\n rules:\n - expr: |-\n container_memory_cache{job=\"\
kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n * on (cluster,\
\ namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,\n \
\ max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n \
\ )\n record: node_namespace_pod_container:container_memory_cache\n"
# Rule group `k8s.rules.container_memory_rss`: one recording rule,
# node_namespace_pod_container:container_memory_rss. It takes
# container_memory_rss (job="kubelet", metrics_path="/metrics/cadvisor",
# non-empty image) and multiplies it, on (cluster, namespace, pod), by one
# kube_pod_info series per pod (topk 1) so the node label is carried over.
monitoring-kube-prometheus-stack-k8s.rules.container-memory-rss-1baf12c0-1dce-4867-86be-91c1ee948313.yaml: "groups:\n\
- name: k8s.rules.container_memory_rss\n rules:\n - expr: |-\n container_memory_rss{job=\"\
kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n * on (cluster,\
\ namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,\n \
\ max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n \
\ )\n record: node_namespace_pod_container:container_memory_rss\n"
# Rule group `k8s.rules.container_memory_swap`: one recording rule,
# node_namespace_pod_container:container_memory_swap. It takes
# container_memory_swap (job="kubelet", metrics_path="/metrics/cadvisor",
# non-empty image) and multiplies it, on (cluster, namespace, pod), by one
# kube_pod_info series per pod (topk 1) so the node label is carried over.
monitoring-kube-prometheus-stack-k8s.rules.container-memory-swap-dd2afe1d-3a0d-44ed-b97a-0fc8e29e111e.yaml: "groups:\n\
- name: k8s.rules.container_memory_swap\n rules:\n - expr: |-\n container_memory_swap{job=\"\
kubelet\", metrics_path=\"/metrics/cadvisor\", image!=\"\"}\n * on (cluster,\
\ namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,\n \
\ max by (cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n \
\ )\n record: node_namespace_pod_container:container_memory_swap\n"
# Rule group `k8s.rules.container_memory_working_set_bytes`: one recording rule,
# node_namespace_pod_container:container_memory_working_set_bytes. It takes
# container_memory_working_set_bytes (job="kubelet",
# metrics_path="/metrics/cadvisor", non-empty image) and multiplies it, on
# (cluster, namespace, pod), by one kube_pod_info series per pod (topk 1) so
# the node label is carried over.
monitoring-kube-prometheus-stack-k8s.rules.container-memory-working-set-by-b44b3b4f-1d0e-466a-9b08-a078ed6f1588.yaml: "groups:\n\
- name: k8s.rules.container_memory_working_set_bytes\n rules:\n - expr: |-\n\
\ container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"\
, image!=\"\"}\n * on (cluster, namespace, pod) group_left(node) topk by\
\ (cluster, namespace, pod) (1,\n max by (cluster, namespace, pod, node)\
\ (kube_pod_info{node!=\"\"})\n )\n record: node_namespace_pod_container:container_memory_working_set_bytes\n"
# Rule group `k8s.rules.container_resource`: eight recording rules built from
# kube-state-metrics series, in four pairs (memory requests, cpu requests,
# memory limits, cpu limits). Each pair records:
#   cluster:namespace:pod_<res>:active:kube_pod_container_resource_<kind>
#       the per-container value, kept only for pods whose
#       kube_pod_status_phase is Pending or Running (== 1);
#   namespace_<res>:kube_pod_container_resource_<kind>:sum
#       the same values (max per container) summed per (namespace, cluster).
monitoring-kube-prometheus-stack-k8s.rules.container-resource-f779d497-1ce1-46cf-8234-c68eca5f1472.yaml: "groups:\n\
- name: k8s.rules.container_resource\n rules:\n - expr: |-\n kube_pod_container_resource_requests{resource=\"\
memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left()\
\ max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"\
Pending|Running\"} == 1)\n )\n record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests\n\
\ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\
\ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\
\ kube_pod_container_resource_requests{resource=\"memory\",job=\"\
kube-state-metrics\"}\n ) * on (namespace, pod, cluster) group_left()\
\ max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"\
Pending|Running\"} == 1\n )\n )\n )\n record: namespace_memory:kube_pod_container_resource_requests:sum\n\
\ - expr: |-\n kube_pod_container_resource_requests{resource=\"cpu\",job=\"\
kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left() max\
\ by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"\
} == 1)\n )\n record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests\n\
\ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\
\ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\
\ kube_pod_container_resource_requests{resource=\"cpu\",job=\"\
kube-state-metrics\"}\n ) * on (namespace, pod, cluster) group_left()\
\ max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"\
Pending|Running\"} == 1\n )\n )\n )\n record: namespace_cpu:kube_pod_container_resource_requests:sum\n\
\ - expr: |-\n kube_pod_container_resource_limits{resource=\"memory\",job=\"\
kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left() max\
\ by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"\
} == 1)\n )\n record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits\n\
\ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\
\ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\
\ kube_pod_container_resource_limits{resource=\"memory\",job=\"\
kube-state-metrics\"}\n ) * on (namespace, pod, cluster) group_left()\
\ max by (namespace, pod, cluster) (\n kube_pod_status_phase{phase=~\"\
Pending|Running\"} == 1\n )\n )\n )\n record: namespace_memory:kube_pod_container_resource_limits:sum\n\
\ - expr: |-\n kube_pod_container_resource_limits{resource=\"cpu\",job=\"\
kube-state-metrics\"} * on (namespace, pod, cluster)\n group_left() max\
\ by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"\
} == 1)\n )\n record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits\n\
\ - expr: |-\n sum by (namespace, cluster) (\n sum by (namespace,\
\ pod, cluster) (\n max by (namespace, pod, container, cluster) (\n\
\ kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"\
}\n ) * on (namespace, pod, cluster) group_left() max by (namespace,\
\ pod, cluster) (\n kube_pod_status_phase{phase=~\"Pending|Running\"\
} == 1\n )\n )\n )\n record: namespace_cpu:kube_pod_container_resource_limits:sum\n"
# Rule group `k8s.rules.pod_owner`: eight recording rules that all write
# namespace_workload_pod:kube_pod_owner:relabel, mapping each pod to a
# `workload` label based on kube-state-metrics owner series. In order:
#   1. workload_type=replicaset   pods owned by a ReplicaSet whose own
#                                 owner_kind is empty; workload = ReplicaSet name
#   2. workload_type=deployment   pods owned by a ReplicaSet that is owned by a
#                                 Deployment; workload = Deployment name
#   3. workload_type=daemonset    pods owned by a DaemonSet
#   4. workload_type=statefulset  pods owned by a StatefulSet
#   5. workload_type=job          pods owned by a Job whose kube_job_owner
#                                 owner_kind matches "Pod|"; workload = Job name
#   6. workload_type=barepod      pods with empty owner_kind and owner_name;
#                                 workload = pod name
#   7. workload_type=staticpod    pods with owner_kind="Node"; workload = pod name
#   8. (no static label)          remaining cases - Jobs and ReplicaSets owned
#                                 by some other kind, and pods owned directly
#                                 by a kind not listed above; workload_type is
#                                 copied from owner_kind
monitoring-kube-prometheus-stack-k8s.rules.pod-owner-6921c522-c42e-43f6-8c4c-296d6fd5994e.yaml: "groups:\n\
- name: k8s.rules.pod_owner\n rules:\n - expr: |-\n max by (cluster, namespace,\
\ workload, pod) (\n label_replace(\n label_replace(\n \
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n\
\ \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on\
\ (cluster, replicaset, namespace) group_left(owner_name) topk by (cluster, replicaset,\
\ namespace) (\n 1, max by (cluster, replicaset, namespace, owner_name)\
\ (\n kube_replicaset_owner{job=\"kube-state-metrics\", owner_kind=\"\
\"}\n )\n ),\n \"workload\", \"$1\", \"replicaset\"\
, \"(.*)\"\n )\n )\n labels:\n workload_type: replicaset\n\
\ record: namespace_workload_pod:kube_pod_owner:relabel\n - expr: |-\n \
\ max by (cluster, namespace, workload, pod) (\n label_replace(\n \
\ label_replace(\n kube_pod_owner{job=\"kube-state-metrics\"\
, owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\"\
, \"(.*)\"\n ) * on (replicaset, namespace, cluster) group_left(owner_name)\
\ topk by (cluster, replicaset, namespace) (\n 1, max by (cluster,\
\ replicaset, namespace, owner_name) (\n kube_replicaset_owner{job=\"\
kube-state-metrics\", owner_kind=\"Deployment\"}\n )\n ),\n\
\ \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n )\n\
\ labels:\n workload_type: deployment\n record: namespace_workload_pod:kube_pod_owner:relabel\n\
\ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"\
},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n \
\ )\n labels:\n workload_type: daemonset\n record: namespace_workload_pod:kube_pod_owner:relabel\n\
\ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"\
},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\")\n )\n labels:\n\
\ workload_type: statefulset\n record: namespace_workload_pod:kube_pod_owner:relabel\n\
\ - expr: |-\n group by (cluster, namespace, workload, pod) (\n label_join(\n\
\ group by (cluster, namespace, job_name, pod, owner_name) (\n \
\ label_join(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"\
Job\"}\n , \"job_name\", \"\", \"owner_name\")\n )\n \
\ * on (cluster, namespace, job_name) group_left()\n group by (cluster,\
\ namespace, job_name) (\n kube_job_owner{job=\"kube-state-metrics\"\
, owner_kind=~\"Pod|\"}\n )\n , \"workload\", \"\", \"owner_name\"\
)\n )\n labels:\n workload_type: job\n record: namespace_workload_pod:kube_pod_owner:relabel\n\
\ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"\", owner_name=\"\
\"},\n \"workload\", \"$1\", \"pod\", \"(.+)\")\n )\n labels:\n\
\ workload_type: barepod\n record: namespace_workload_pod:kube_pod_owner:relabel\n\
\ - expr: |-\n max by (cluster, namespace, workload, pod) (\n label_replace(\n\
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Node\"},\n\
\ \"workload\", \"$1\", \"pod\", \"(.+)\")\n )\n labels:\n \
\ workload_type: staticpod\n record: namespace_workload_pod:kube_pod_owner:relabel\n\
\ - expr: |-\n group by (cluster, namespace, workload, workload_type, pod)\
\ (\n label_join(\n label_join(\n group by (cluster,\
\ namespace, job_name, pod) (\n label_join(\n kube_pod_owner{job=\"\
kube-state-metrics\", owner_kind=\"Job\"}\n , \"job_name\", \"\"\
, \"owner_name\")\n )\n * on (cluster, namespace, job_name)\
\ group_left(owner_kind, owner_name)\n group by (cluster, namespace,\
\ job_name, owner_kind, owner_name) (\n kube_job_owner{job=\"kube-state-metrics\"\
, owner_kind!=\"Pod\", owner_kind!=\"\"}\n )\n , \"workload\"\
, \"\", \"owner_name\")\n , \"workload_type\", \"\", \"owner_kind\")\n\n\
\ OR\n\n label_replace(\n label_replace(\n label_replace(\n\
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"\
}\n , \"replicaset\", \"$1\", \"owner_name\", \"(.+)\"\n \
\ )\n * on (cluster, namespace, replicaset) group_left(owner_kind,\
\ owner_name)\n group by (cluster, namespace, replicaset, owner_kind,\
\ owner_name) (\n kube_replicaset_owner{job=\"kube-state-metrics\"\
, owner_kind!=\"Deployment\", owner_kind!=\"\"}\n )\n , \"\
workload\", \"$1\", \"owner_name\", \"(.+)\")\n OR\n label_replace(\n\
\ group by (cluster, namespace, pod, owner_name, owner_kind) (\n \
\ kube_pod_owner{job=\"kube-state-metrics\", owner_kind!=\"ReplicaSet\"\
, owner_kind!=\"DaemonSet\", owner_kind!=\"StatefulSet\", owner_kind!=\"Job\"\
, owner_kind!=\"Node\", owner_kind!=\"\"}\n )\n , \"workload\"\
, \"$1\", \"owner_name\", \"(.+)\"\n )\n , \"workload_type\",\
\ \"$1\", \"owner_kind\", \"(.+)\")\n )\n record: namespace_workload_pod:kube_pod_owner:relabel\n"
# Rule group `kube-apiserver-availability.rules` (group interval: 3m).
# Recording rules, in file order:
#   code_verb:apiserver_request_total:increase30d
#       avg_over_time of the 1h increase over 30d, scaled by 24 * 30.
#   code:apiserver_request_total:increase30d  (verb=read / verb=write)
#       the 30d series summed per (cluster, code) for LIST|GET and for
#       POST|PUT|PATCH|DELETE.
#   cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:
#   increase1h / increase30d, plus the matching ..._count series (le="+Inf").
#   apiserver_request:availability30d  (verb=all / read / write)
#       1 - (too-slow requests + 5xx responses) / all requests. "Too slow"
#       means outside the le bucket of 1s for writes and for reads with scope
#       "resource" or empty, 5s for scope="namespace", 30s for scope="cluster".
#   code_resource:apiserver_request_total:rate5m  (verb=read / verb=write)
#   code_verb:apiserver_request_total:increase1h
#       recorded four times, once per status class (2.., 3.., 4.., 5..).
monitoring-kube-prometheus-stack-kube-apiserver-availability.rules-fe1708b8-332c-4d9d-be30-1a6e49774f5a.yaml: "groups:\n\
- interval: 3m\n name: kube-apiserver-availability.rules\n rules:\n - expr:\
\ avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 *\n \
\ 30\n record: code_verb:apiserver_request_total:increase30d\n - expr:\
\ sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"\
LIST|GET\"})\n labels:\n verb: read\n record: code:apiserver_request_total:increase30d\n\
\ - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"\
POST|PUT|PATCH|DELETE\"})\n labels:\n verb: write\n record: code:apiserver_request_total:increase30d\n\
\ - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))\n\
\ record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h\n\
\ - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d])\n\
\ * 24 * 30)\n record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d\n\
\ - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le=\"\
+Inf\"})\n record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h\n\
\ - expr: sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le=\"\
+Inf\"})\n record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d\n\
\ - expr: |-\n 1 - (\n (\n # write too slow\n sum\
\ by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\
POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"} or vector(0))\n ) +\n \
\ (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\
LIST|GET\"})\n -\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"} or vector(0))\n \
\ +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"} or vector(0))\n \
\ +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"} or vector(0))\n )\n\
\ ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"\
5..\"} or vector(0))\n )\n /\n sum by (cluster) (code:apiserver_request_total:increase30d)\n\
\ labels:\n verb: all\n record: apiserver_request:availability30d\n\
\ - expr: |-\n 1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\
LIST|GET\"})\n -\n (\n # too slow\n sum by (cluster)\
\ (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=~\"resource|\",le=~\"1(\\\\.0)?\"} or vector(0))\n +\n\
\ sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=\"namespace\",le=~\"5(\\\\.0)?\"} or vector(0))\n +\n\
\ sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
LIST|GET\",scope=\"cluster\",le=~\"30(\\\\.0)?\"} or vector(0))\n )\n \
\ +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"\
read\",code=~\"5..\"} or vector(0))\n )\n /\n sum by (cluster)\
\ (code:apiserver_request_total:increase30d{verb=\"read\"})\n labels:\n \
\ verb: read\n record: apiserver_request:availability30d\n - expr: |-\n\
\ 1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~\"\
POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~\"\
POST|PUT|PATCH|DELETE\",le=~\"1(\\\\.0)?\"} or vector(0))\n )\n \
\ +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"\
write\",code=~\"5..\"} or vector(0))\n )\n /\n sum by (cluster)\
\ (code:apiserver_request_total:increase30d{verb=\"write\"})\n labels:\n \
\ verb: write\n record: apiserver_request:availability30d\n - expr: sum\
\ by (cluster,code,resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"\
LIST|GET\"}[5m]))\n labels:\n verb: read\n record: code_resource:apiserver_request_total:rate5m\n\
\ - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n labels:\n verb: write\n\
\ record: code_resource:apiserver_request_total:rate5m\n - expr: sum by (cluster,\
\ code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\"\
,code=~\"2..\"}[1h]))\n record: code_verb:apiserver_request_total:increase1h\n\
\ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"\
apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"3..\"}[1h]))\n \
\ record: code_verb:apiserver_request_total:increase1h\n - expr: sum by (cluster,\
\ code, verb) (increase(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\"\
,code=~\"4..\"}[1h]))\n record: code_verb:apiserver_request_total:increase1h\n\
\ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job=\"\
apiserver\",verb=~\"LIST|GET|POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n \
\ record: code_verb:apiserver_request_total:increase1h\n"
monitoring-kube-prometheus-stack-kube-apiserver-burnrate.rules-5b37d5cc-ec92-44ef-8b83-84bd7039e174.yaml: "groups:\n\
- name: kube-apiserver-burnrate.rules\n rules:\n - expr: |-\n (\n \
\ (\n # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[1d]))\n -\n (\n (\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\"\
,le=~\"1(\\\\.0)?\"}[1d]))\n or\n vector(0)\n \
\ )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1d]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[1d]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n )\n /\n sum by (cluster)\
\ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n\
\ labels:\n verb: read\n record: apiserver_request:burnrate1d\n -\
\ expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n \
\ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[1h]))\n or\n \
\ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[1h]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[1h]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n )\n /\n sum by (cluster)\
\ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n\
\ labels:\n verb: read\n record: apiserver_request:burnrate1h\n -\
\ expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n \
\ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[2h]))\n or\n \
\ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[2h]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[2h]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n )\n /\n sum by (cluster)\
\ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n\
\ labels:\n verb: read\n record: apiserver_request:burnrate2h\n -\
\ expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n \
\ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[30m]))\n or\n \
\ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[30m]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[30m]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n )\n /\n \
\ sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"\
}[30m]))\n labels:\n verb: read\n record: apiserver_request:burnrate30m\n\
\ - expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n \
\ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[3d]))\n or\n \
\ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[3d]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[3d]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n )\n /\n sum by (cluster)\
\ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n\
\ labels:\n verb: read\n record: apiserver_request:burnrate3d\n -\
\ expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n \
\ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[5m]))\n or\n \
\ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[5m]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[5m]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n )\n /\n sum by (cluster)\
\ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n\
\ labels:\n verb: read\n record: apiserver_request:burnrate5m\n -\
\ expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n \
\ -\n (\n (\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=~\"resource|\",le=~\"1(\\\\.0)?\"}[6h]))\n or\n \
\ vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"\
,scope=\"namespace\",le=~\"5(\\\\.0)?\"}[6h]))\n +\n sum\
\ by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"apiserver\"\
,verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"\
cluster\",le=~\"30(\\\\.0)?\"}[6h]))\n )\n )\n +\n \
\ # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n )\n /\n sum by (cluster)\
\ (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n\
\ labels:\n verb: read\n record: apiserver_request:burnrate6h\n -\
\ expr: |-\n (\n (\n # too slow\n sum by (cluster)\
\ (rate(apiserver_request_sli_duration_seconds_count{job=\"apiserver\",verb=~\"\
POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n\
\ -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n labels:\n verb: write\n record:\
\ apiserver_request:burnrate1d\n - expr: |-\n (\n (\n #\
\ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n labels:\n verb: write\n record:\
\ apiserver_request:burnrate1h\n - expr: |-\n (\n (\n #\
\ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n labels:\n verb: write\n record:\
\ apiserver_request:burnrate2h\n - expr: |-\n (\n (\n #\
\ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n labels:\n verb: write\n \
\ record: apiserver_request:burnrate30m\n - expr: |-\n (\n (\n \
\ # too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n labels:\n verb: write\n record:\
\ apiserver_request:burnrate3d\n - expr: |-\n (\n (\n #\
\ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n labels:\n verb: write\n record:\
\ apiserver_request:burnrate5m\n - expr: |-\n (\n (\n #\
\ too slow\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"\
,le=~\"1(\\\\.0)?\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"\
apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n )\n \
\ /\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\"\
,verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n labels:\n verb: write\n record:\
\ apiserver_request:burnrate6h\n"
# Rule file for the kube-apiserver-histogram.rules group: two recording rules
# that store the 0.99 quantile of apiserver_request_sli_duration_seconds per
# (cluster, resource) over a 5m rate window, one labelled verb=read (LIST|GET)
# and one labelled verb=write (POST|PUT|PATCH|DELETE). Results equal to 0 are
# dropped by the trailing "> 0" filter.
monitoring-kube-prometheus-stack-kube-apiserver-histogram.rules-3df1ba86-8ec2-4750-a04f-5e59108c7ba3.yaml: |
  groups:
  - name: kube-apiserver-histogram.rules
    rules:
    - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])))
        > 0
      labels:
        quantile: "0.99"
        verb: read
      record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])))
        > 0
      labels:
        quantile: "0.99"
        verb: write
      record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
# Rule file for the kube-apiserver-slos group: four KubeAPIErrorBudgetBurn
# alerts. Each one requires BOTH a long-window and a short-window burn-rate sum
# (per cluster) to exceed the same "factor * 0.01000" threshold:
#   1h / 5m  -> factor 14.40, for 2m,  severity critical
#   6h / 30m -> factor 6.00,  for 15m, severity critical
#   1d / 2h  -> factor 3.00,  for 1h,  severity warning
#   3d / 6h  -> factor 1.00,  for 3h,  severity warning
monitoring-kube-prometheus-stack-kube-apiserver-slos-be960e5a-cbba-488b-b2a9-b89b70183179.yaml: |
  groups:
  - name: kube-apiserver-slos
    rules:
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget on cluster {{ $labels.cluster
          }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum by (cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000)
        and on (cluster)
        sum by (cluster) (apiserver_request:burnrate5m) > (14.40 * 0.01000)
      for: 2m
      labels:
        long: 1h
        severity: critical
        short: 5m
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget on cluster {{ $labels.cluster
          }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum by (cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000)
        and on (cluster)
        sum by (cluster) (apiserver_request:burnrate30m) > (6.00 * 0.01000)
      for: 15m
      labels:
        long: 6h
        severity: critical
        short: 30m
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget on cluster {{ $labels.cluster
          }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum by (cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000)
        and on (cluster)
        sum by (cluster) (apiserver_request:burnrate2h) > (3.00 * 0.01000)
      for: 1h
      labels:
        long: 1d
        severity: warning
        short: 2h
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget on cluster {{ $labels.cluster
          }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum by (cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000)
        and on (cluster)
        sum by (cluster) (apiserver_request:burnrate6h) > (1.00 * 0.01000)
      for: 3h
      labels:
        long: 3d
        severity: warning
        short: 6h
# Rule file for the kube-prometheus-general.rules group: records how many `up`
# series equal 1 (count:up1) and how many equal 0 (count:up0), aggregated with
# the instance, pod and node labels removed.
monitoring-kube-prometheus-stack-kube-prometheus-general.rules-cdf9488c-4fa3-4c7a-9be4-ddcaee437598.yaml: |
  groups:
  - name: kube-prometheus-general.rules
    rules:
    - expr: count without(instance, pod, node) (up == 1)
      record: count:up1
    - expr: count without(instance, pod, node) (up == 0)
      record: count:up0
# Rule file for the kube-prometheus-node-recording.rules group. Records:
#   - per-instance CPU rate (3m window) excluding the idle, iowait and steal modes
#   - per-instance network receive / transmit byte rates (3m window)
#   - per-instance CPU ratio: 5m CPU rate divided by the number of
#     (instance, cpu) series for that instance
#   - cluster-wide 5m CPU rate and its ratio to the total (instance, cpu) count
monitoring-kube-prometheus-stack-kube-prometheus-node-recording.rules-14de50cd-57b8-4248-a7c5-054469786b93.yaml: |
  groups:
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
        BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        WITHOUT (cpu, mode) / ON (instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
        BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance,
        cpu))
      record: cluster:node_cpu:ratio
# Rule file for the kube-scheduler.rules group: nine recording rules giving the
# 0.99, 0.9 and 0.5 quantiles (5m rate window, instance and pod labels removed)
# of three kube-scheduler histograms:
#   scheduler_scheduling_attempt_duration_seconds
#   scheduler_scheduling_algorithm_duration_seconds
#   scheduler_pod_scheduling_sli_duration_seconds
# The quantile is attached to each recorded series as a string label.
monitoring-kube-prometheus-stack-kube-scheduler.rules-da454218-3276-463d-abe3-7043553c8f35.yaml: |
  groups:
  - name: kube-scheduler.rules
    rules:
    - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.99, sum(rate(scheduler_pod_scheduling_sli_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_pod_scheduling_sli_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.9, sum(rate(scheduler_pod_scheduling_sli_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_pod_scheduling_sli_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: histogram_quantile(0.5, sum(rate(scheduler_pod_scheduling_sli_duration_seconds_bucket{job="kube-scheduler"}[5m]))
        without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_pod_scheduling_sli_duration_seconds:histogram_quantile
# Rule file for the kube-state-metrics group: four alerts, all severity
# critical with a 15m hold:
#   KubeStateMetricsListErrors       - list error ratio per cluster above 0.01
#   KubeStateMetricsWatchErrors      - watch error ratio per cluster above 0.01
#   KubeStateMetricsShardingMismatch - total_shards values differ (stdvar != 0)
#   KubeStateMetricsShardsMissing    - 2^max(total_shards) - 1 does not equal
#                                      the sum of 2^shard_ordinal per cluster
monitoring-kube-prometheus-stack-kube-state-metrics-9826e852-e343-4d08-9f0c-4c5896358ba2.yaml: |
  groups:
  - name: kube-state-metrics
    rules:
    - alert: KubeStateMetricsListErrors
      annotations:
        description: kube-state-metrics is experiencing errors at an elevated rate in
          list operations. This is likely causing it to not be able to expose metrics
          about Kubernetes objects correctly or at all.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
        summary: kube-state-metrics is experiencing errors in list operations.
      expr: |-
        (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
          /
        sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
        > 0.01
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsWatchErrors
      annotations:
        description: kube-state-metrics is experiencing errors at an elevated rate in
          watch operations. This is likely causing it to not be able to expose metrics
          about Kubernetes objects correctly or at all.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
        summary: kube-state-metrics is experiencing errors in watch operations.
      expr: |-
        (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
          /
        sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
        > 0.01
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsShardingMismatch
      annotations:
        description: kube-state-metrics pods are running with different --total-shards
          configuration, some Kubernetes objects may be exposed multiple times or not
          exposed at all.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
        summary: kube-state-metrics sharding is misconfigured.
      expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster)
        != 0
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsShardsMissing
      annotations:
        description: kube-state-metrics shards are missing, some Kubernetes objects
          are not being exposed.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
        summary: kube-state-metrics shards are missing.
      expr: |-
        2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
          -
        sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
        != 0
      for: 15m
      labels:
        severity: critical
# Rule file for the kubelet.rules group: records the 0.99, 0.9 and 0.5
# quantiles of kubelet_pleg_relist_duration_seconds (5m rate window) per
# (cluster, instance). Each bucket rate is multiplied by kubelet_node_name with
# group_left(node), which copies the node label onto the recorded series.
monitoring-kube-prometheus-stack-kubelet.rules-931b5e0c-2d70-4f8b-9987-5b5cfaac8845.yaml: |
  groups:
  - name: kubelet.rules
    rules:
    - expr: |-
        histogram_quantile(
          0.99,
          sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)
          * on (cluster, instance) group_left (node)
          max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"})
        )
      labels:
        quantile: "0.99"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: |-
        histogram_quantile(
          0.9,
          sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)
          * on (cluster, instance) group_left (node)
          max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"})
        )
      labels:
        quantile: "0.9"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: |-
        histogram_quantile(
          0.5,
          sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)
          * on (cluster, instance) group_left (node)
          max by (cluster, instance, node) (kubelet_node_name{job="kubelet", metrics_path="/metrics"})
        )
      labels:
        quantile: "0.5"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
monitoring-kube-prometheus-stack-kubernetes-apps-eaf9c0ce-babd-40f5-913f-7c8c14272dcc.yaml: "groups:\n\
- name: kubernetes-apps\n rules:\n - alert: KubePodCrashLooping\n annotations:\n\
\ description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container\n\
\ }}) is in waiting state (reason: \"CrashLoopBackOff\") on cluster {{\
\ $labels.cluster\n }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping\n\
\ summary: Pod is crash looping.\n expr: max_over_time(kube_pod_container_status_waiting_reason{reason=\"\
CrashLoopBackOff\",\n job=\"kube-state-metrics\", namespace=~\".*\"}[5m])\
\ >= 1\n for: 15m\n labels:\n severity: warning\n - alert: KubePodNotReady\n\
\ annotations:\n description: Pod {{ $labels.namespace }}/{{ $labels.pod\
\ }} has been in a non-ready\n state for longer than 15 minutes on cluster\
\ {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready\n\
\ summary: Pod has been in a non-ready state for more than 15 minutes.\n\
\ expr: |-\n sum by (namespace, pod, job, cluster) (\n max by (namespace,\
\ pod, job, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\"\
, namespace=~\".*\", phase=~\"Pending|Unknown\"}\n ) * on (namespace, pod,\
\ cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (\n \
\ 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"\
Job\"})\n )\n ) > 0\n for: 15m\n labels:\n severity: warning\n\
\ - alert: KubeDeploymentGenerationMismatch\n annotations:\n description:\
\ Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment\n \
\ }} does not match, this indicates that the Deployment has failed but has\
\ not\n been rolled back on cluster {{ $labels.cluster }}.\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch\n\
\ summary: Deployment generation mismatch due to possible roll-back\n \
\ expr: |-\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n !=\n kube_deployment_metadata_generation{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n for: 15m\n labels:\n severity:\
\ warning\n - alert: KubeDeploymentReplicasMismatch\n annotations:\n \
\ description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has\n\
\ not matched the expected number of replicas for longer than 15 minutes\
\ on\n cluster {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch\n\
\ summary: Deployment has not matched the expected number of replicas.\n\
\ expr: |-\n (\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n >\n kube_deployment_status_replicas_available{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n ) and (\n changes(kube_deployment_status_replicas_updated{job=\"\
kube-state-metrics\", namespace=~\".*\"}[10m])\n ==\n 0\n \
\ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeDeploymentRolloutStuck\n\
\ annotations:\n description: Rollout of deployment {{ $labels.namespace\
\ }}/{{ $labels.deployment\n }} is not progressing for longer than 15 minutes\
\ on cluster {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck\n\
\ summary: Deployment rollout is not progressing.\n expr: |-\n kube_deployment_status_condition{condition=\"\
Progressing\", status=\"false\",job=\"kube-state-metrics\", namespace=~\".*\"\
}\n != 0\n for: 15m\n labels:\n severity: warning\n - alert:\
\ KubeStatefulSetReplicasMismatch\n annotations:\n description: StatefulSet\
\ {{ $labels.namespace }}/{{ $labels.statefulset }} has\n not matched the\
\ expected number of replicas for longer than 15 minutes on\n cluster {{\
\ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch\n\
\ summary: StatefulSet has not matched the expected number of replicas.\n\
\ expr: |-\n (\n kube_statefulset_status_replicas_ready{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n !=\n kube_statefulset_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n ) and (\n changes(kube_statefulset_status_replicas_updated{job=\"\
kube-state-metrics\", namespace=~\".*\"}[10m])\n ==\n 0\n \
\ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeStatefulSetGenerationMismatch\n\
\ annotations:\n description: StatefulSet generation for {{ $labels.namespace\
\ }}/{{ $labels.statefulset\n }} does not match, this indicates that the\
\ StatefulSet has failed but has\n not been rolled back on cluster {{ $labels.cluster\
\ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch\n\
\ summary: StatefulSet generation mismatch due to possible roll-back\n \
\ expr: |-\n kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n !=\n kube_statefulset_metadata_generation{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n for: 15m\n labels:\n severity:\
\ warning\n - alert: KubeStatefulSetUpdateNotRolledOut\n annotations:\n \
\ description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}\
\ update\n has not been rolled out on cluster {{ $labels.cluster }}.\n\
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout\n\
\ summary: StatefulSet update has not been rolled out.\n expr: |-\n \
\ (\n max by (namespace, statefulset, job, cluster) (\n kube_statefulset_status_current_revision{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n unless\n kube_statefulset_status_update_revision{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n )\n * on (namespace,\
\ statefulset, job, cluster)\n (\n kube_statefulset_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n )\n ) and on (namespace,\
\ statefulset, job, cluster) (\n changes(kube_statefulset_status_replicas_updated{job=\"\
kube-state-metrics\", namespace=~\".*\"}[5m])\n ==\n 0\n \
\ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeDaemonSetRolloutStuck\n\
\ annotations:\n description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset\
\ }} has not\n finished or progressed for at least 15m on cluster {{ $labels.cluster\
\ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck\n\
\ summary: DaemonSet rollout is stuck.\n expr: |-\n (\n (\n\
\ kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n !=\n 0\n \
\ ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n ) or (\n kube_daemonset_status_number_available{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n )\n ) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}[5m])\n ==\n 0\n \
\ )\n for: 15m\n labels:\n severity: warning\n - alert: KubeContainerWaiting\n\
\ annotations:\n description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace\
\ }} on\n container {{ $labels.container}} has been in waiting state for\
\ longer than\n 1 hour. (reason: \"{{ $labels.reason }}\") on cluster {{\
\ $labels.cluster }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting\n\
\ summary: Pod container waiting longer than 1 hour\n expr: kube_pod_container_status_waiting_reason{reason!=\"\
CrashLoopBackOff\", job=\"kube-state-metrics\",\n namespace=~\".*\"} > 0\n\
\ for: 1h\n labels:\n severity: warning\n - alert: KubeDaemonSetNotScheduled\n\
\ annotations:\n description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace\
\ }}/{{ $labels.daemonset\n }} are not scheduled on cluster {{ $labels.cluster\
\ }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled\n\
\ summary: DaemonSet pods are not scheduled.\n expr: |-\n kube_daemonset_status_desired_number_scheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n -\n kube_daemonset_status_current_number_scheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"} > 0\n for: 10m\n labels:\n \
\ severity: warning\n - alert: KubeDaemonSetMisScheduled\n annotations:\n\
\ description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{\
\ $labels.daemonset\n }} are running where they are not supposed to run\
\ on cluster {{ $labels.cluster\n }}.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled\n\
\ summary: DaemonSet pods are misscheduled.\n expr: kube_daemonset_status_number_misscheduled{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n > 0\n for: 15m\n labels:\n\
\ severity: warning\n - alert: KubeJobNotCompleted\n annotations:\n \
\ description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking\
\ more\n than {{ \"43200\" | humanizeDuration }} to complete on cluster\
\ {{ $labels.cluster\n }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted\n\
\ summary: Job did not complete in time\n expr: |-\n time() - max\
\ by (namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n and\n kube_job_status_active{job=\"kube-state-metrics\"\
, namespace=~\".*\"} > 0) > 43200\n labels:\n severity: warning\n - alert:\
\ KubeJobFailed\n annotations:\n description: Job {{ $labels.namespace\
\ }}/{{ $labels.job_name }} failed to complete.\n Removing failed job after\
\ investigation should clear this alert on cluster\n {{ $labels.cluster\
\ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed\n\
\ summary: Job failed to complete.\n expr: kube_job_failed{job=\"kube-state-metrics\"\
, namespace=~\".*\"} > 0\n for: 15m\n labels:\n severity: warning\n\
\ - alert: KubeHpaReplicasMismatch\n annotations:\n description: HPA\
\ {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}\n has\
\ not matched the desired number of replicas for longer than 15 minutes\n \
\ on cluster {{ $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch\n\
\ summary: HPA has not matched desired number of replicas.\n expr: |-\n\
\ (kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n !=\n kube_horizontalpodautoscaler_status_current_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"})\n and\n (kube_horizontalpodautoscaler_status_current_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n >\n kube_horizontalpodautoscaler_spec_min_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"})\n and\n (kube_horizontalpodautoscaler_status_current_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n <\n kube_horizontalpodautoscaler_spec_max_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"})\n and\n changes(kube_horizontalpodautoscaler_status_current_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}[15m]) == 0\n for: 15m\n labels:\n\
\ severity: warning\n - alert: KubeHpaMaxedOut\n annotations:\n \
\ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler\
\ }}\n has been running at max replicas for longer than 15 minutes on\
\ cluster {{\n $labels.cluster }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout\n\
\ summary: HPA is running at max replicas\n expr: |-\n (\n \
\ kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n ==\n kube_horizontalpodautoscaler_spec_max_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n )\n and on (namespace, horizontalpodautoscaler)\
\ (\n kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n !=\n kube_horizontalpodautoscaler_spec_min_replicas{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n )\n for: 15m\n labels:\n\
\ severity: warning\n - alert: KubePdbNotEnoughHealthyPods\n annotations:\n\
\ description: PDB {{ $labels.cluster }}/{{ $labels.namespace }}/{{ $labels.poddisruptionbudget\n\
\ }} expects {{ $value }} more healthy pods. The desired number of healthy\
\ pods\n has not been met for at least 15m.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepdbnotenoughhealthypods\n\
\ summary: PDB does not have enough healthy pods.\n expr: |-\n (\n\
\ kube_poddisruptionbudget_status_desired_healthy{job=\"kube-state-metrics\"\
, namespace=~\".*\"}\n -\n kube_poddisruptionbudget_status_current_healthy{job=\"\
kube-state-metrics\", namespace=~\".*\"}\n )\n > 0\n for: 15m\n \
\ labels:\n severity: warning\n"
# Rule file for the "kubernetes-resources" group. Alerts defined below:
# KubeCPUOvercommit, KubeMemoryOvercommit, KubeCPUQuotaOvercommit,
# KubeMemoryQuotaOvercommit, KubeQuotaAlmostFull, KubeQuotaFullyUsed,
# KubeQuotaExceeded and CPUThrottlingHigh.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-resources-b694afc5-821c-4800-a61c-a61d36f5c15f.yaml: |
  groups:
  - name: kubernetes-resources
    rules:
    - alert: KubeCPUOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
          for Pods by {{ printf "%.2f" $value }} CPU shares and cannot tolerate node
          failure.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |-
        # Non-HA clusters.
        (
          (
            sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})
            -
            sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) > 0
          )
          and
          count by (cluster) (max by (cluster, node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
        )
        or
        # HA clusters.
        (
          sum by (cluster) (namespace_cpu:kube_pod_container_resource_requests:sum{})
          -
          (
            # Skip clusters with only one allocatable node.
            (
              sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"})
              -
              max by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"})
            ) > 0
          ) > 0
        )
      for: 10m
      labels:
        severity: warning
    - alert: KubeMemoryOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted memory resource
          requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
          failure.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |-
        # Non-HA clusters.
        (
          (
            sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})
            -
            sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"}) > 0
          )
          and
          count by (cluster) (max by (cluster, node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
        )
        or
        # HA clusters.
        (
          sum by (cluster) (namespace_memory:kube_pod_container_resource_requests:sum{})
          -
          (
            # Skip clusters with only one allocatable node.
            (
              sum by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"})
              -
              max by (cluster) (kube_node_status_allocatable{job="kube-state-metrics",resource="memory"})
            ) > 0
          ) > 0
        )
      for: 10m
      labels:
        severity: warning
    - alert: KubeCPUQuotaOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
          for Namespaces.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
        summary: Cluster has overcommitted CPU resource requests.
      expr: |-
        sum by (cluster) (
          min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})
        )
        /
        sum by (cluster) (
          kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}
        ) > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemoryQuotaOvercommit
      annotations:
        description: Cluster {{ $labels.cluster }} has overcommitted memory resource
          requests for Namespaces.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
        summary: Cluster has overcommitted memory resource requests.
      expr: |-
        sum by (cluster) (
          min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})
        )
        /
        sum by (cluster) (
          kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}
        ) > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeQuotaAlmostFull
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
          }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
        summary: Namespace quota is going to be full.
      expr: |-
        max without (instance, job, type) (
          kube_resourcequota{job="kube-state-metrics", type="used"}
        )
        / on (cluster, namespace, resource, resourcequota) group_left()
        (
          max without (instance, job, type) (
            kube_resourcequota{job="kube-state-metrics", type="hard"}
          ) > 0
        )
        > 0.9 < 1
      for: 15m
      labels:
        severity: info
    - alert: KubeQuotaFullyUsed
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
          }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
        summary: Namespace quota is fully used.
      expr: |-
        max without (instance, job, type) (
          kube_resourcequota{job="kube-state-metrics", type="used"}
        )
        / on (cluster, namespace, resource, resourcequota) group_left()
        (
          max without (instance, job, type) (
            kube_resourcequota{job="kube-state-metrics", type="hard"}
          ) > 0
        )
        == 1
      for: 15m
      labels:
        severity: info
    - alert: KubeQuotaExceeded
      annotations:
        description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
          }} of its {{ $labels.resource }} quota on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
        summary: Namespace quota has exceeded the limits.
      expr: |-
        max without (instance, job, type) (
          kube_resourcequota{job="kube-state-metrics", type="used"}
        )
        / on (cluster, namespace, resource, resourcequota) group_left()
        (
          max without (instance, job, type) (
            kube_resourcequota{job="kube-state-metrics", type="hard"}
          ) > 0
        ) > 1
      for: 15m
      labels:
        severity: warning
    - alert: CPUThrottlingHigh
      annotations:
        description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
          {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
          }} on cluster {{ $labels.cluster }}.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
        summary: Processes experience elevated CPU throttling.
      expr: |-
        sum without (id, metrics_path, name, image, endpoint, job, node) (
          topk by (cluster, namespace, pod, container, instance) (1,
            increase(
              container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }
            [5m])
          )
        )
        / on (cluster, namespace, pod, container, instance) group_left
        sum without (id, metrics_path, name, image, endpoint, job, node) (
          topk by (cluster, namespace, pod, container, instance) (1,
            increase(
              container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }
            [5m])
          )
        )
        > ( 25 / 100 )
      for: 15m
      labels:
        severity: info
# Rule file for the "kubernetes-storage" group. Alerts defined below:
# KubePersistentVolumeFillingUp (critical at < 3% free; warning at < 15% free
# combined with a predict_linear projection), KubePersistentVolumeInodesFillingUp
# (the same two tiers for inodes) and KubePersistentVolumeErrors.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-storage-a21970f1-cefe-4cfc-876a-1833115df2e4.yaml: |
  groups:
  - name: kubernetes-storage
    rules:
    - alert: KubePersistentVolumeFillingUp
      annotations:
        description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
          {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
        summary: PersistentVolume is filling up.
      expr: |-
        (
          kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
        ) < 0.03
        and
        kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: 1m
      labels:
        severity: critical
    - alert: KubePersistentVolumeFillingUp
      annotations:
        description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
          {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
          | humanizePercentage }} is available.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
        summary: PersistentVolume is filling up.
      expr: |-
        (
          kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
        ) < 0.15
        and
        kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: 1h
      labels:
        severity: warning
    - alert: KubePersistentVolumeInodesFillingUp
      annotations:
        description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
          {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
        summary: PersistentVolumeInodes are filling up.
      expr: |-
        (
          kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            /
          kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
        ) < 0.03
        and
        kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: 1m
      labels:
        severity: critical
    - alert: KubePersistentVolumeInodesFillingUp
      annotations:
        description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
          {{ . }} {{- end }} is expected to run out of inodes within four days. Currently
          {{ $value | humanizePercentage }} of its inodes are free.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
        summary: PersistentVolumeInodes are filling up.
      expr: |-
        (
          kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
            /
          kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
        ) < 0.15
        and
        kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
        and
        predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
        unless on (cluster, namespace, persistentvolumeclaim)
        kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
      for: 1h
      labels:
        severity: warning
    - alert: KubePersistentVolumeErrors
      annotations:
        description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
          -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
        summary: PersistentVolume is having issues with provisioning.
      expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
        > 0
      for: 5m
      labels:
        severity: critical
# Rule file for the "kubernetes-system" group. Alerts defined below:
# KubeVersionMismatch (more than one distinct git_version prefix per cluster)
# and KubeClientErrors (5xx share of rest_client_requests_total above 0.01).
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-system-26e1e614-9a40-44cd-8622-cff2d1258a88.yaml: |
  groups:
  - name: kubernetes-system
    rules:
    - alert: KubeVersionMismatch
      annotations:
        description: There are {{ $value }} different semantic versions of Kubernetes
          components running on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
        summary: Different semantic versions of Kubernetes components running.
      expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*")))
        > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
          }}' is experiencing {{ $value | humanizePercentage }} errors on cluster {{
          $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
        summary: Kubernetes API server client is experiencing errors.
      expr: |-
        (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
          /
        sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace))
        > 0.01
      for: 15m
      labels:
        severity: warning
# Rule file for the "kubernetes-system-apiserver" group. Alerts defined below:
# KubeClientCertificateExpiration (warning below 604800 s, critical below
# 86400 s), KubeAggregatedAPIErrors, KubeAggregatedAPIDown, KubeAPIDown and
# KubeAPITerminatedRequests.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-system-apiserver-f9528ec5-b467-4c90-829a-f4603a9bc7d5.yaml: |
  groups:
  - name: kubernetes-system-apiserver
    rules:
    - alert: KubeClientCertificateExpiration
      annotations:
        description: A client certificate used to authenticate to kubernetes apiserver
          is expiring in less than 7.0 days on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
        summary: Client certificate is about to expire.
      expr: |-
        histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
        and
        on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
      for: 5m
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        description: A client certificate used to authenticate to kubernetes apiserver
          is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
        summary: Client certificate is about to expire.
      expr: |-
        histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
        and
        on (job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
      for: 5m
      labels:
        severity: critical
    - alert: KubeAggregatedAPIErrors
      annotations:
        description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name
          }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster
          }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
        summary: Kubernetes aggregated API has reported errors.
      expr: sum by (cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m]))
        > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeAggregatedAPIDown
      annotations:
        description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
          }} has been only {{ $value | humanize }}% available over the last 10m on cluster
          {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
        summary: Kubernetes aggregated API is down.
      expr: (1 - max by (name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m])))
        * 100 < 85
      for: 5m
      labels:
        severity: warning
    - alert: KubeAPIDown
      annotations:
        description: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="apiserver"})
      for: 15m
      labels:
        severity: critical
    - alert: KubeAPITerminatedRequests
      annotations:
        description: The kubernetes apiserver has terminated {{ $value | humanizePercentage
          }} of its incoming requests on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
        summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage
          }} of its incoming requests.
      expr: sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
        / ( sum by (cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) +
        sum by (cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
        ) > 0.20
      for: 5m
      labels:
        severity: warning
# Rule file for the "kubernetes-system-controller-manager" group: a single
# alert, KubeControllerManagerDown (severity critical), raised when
# absent(up{job="kube-controller-manager"}) has held for 15m.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-system-controller-manager-965b603b-05c3-4e36-9b70-30adcbb5400d.yaml: |
  groups:
  - name: kubernetes-system-controller-manager
    rules:
    - alert: KubeControllerManagerDown
      annotations:
        description: KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-controller-manager"})
      for: 15m
      labels:
        severity: critical
# Rule file for the "kubernetes-system-kube-proxy" group: a single alert,
# KubeProxyDown (severity critical), raised when absent(up{job="kube-proxy"})
# has held for 15m.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-system-kube-proxy-b946dd15-5f3f-490e-a788-25134043fbfb.yaml: |
  groups:
  - name: kubernetes-system-kube-proxy
    rules:
    - alert: KubeProxyDown
      annotations:
        description: KubeProxy has disappeared from Prometheus target discovery.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-proxy"})
      for: 15m
      labels:
        severity: critical
# Rule file for the "kubernetes-system-kubelet" group. Alerts defined below:
# KubeNodeNotReady, KubeNodePressure, KubeNodeUnreachable, KubeletTooManyPods,
# KubeNodeReadinessFlapping, KubeNodeEviction, KubeletPlegDurationHigh,
# KubeletPodStartUpLatencyHigh, KubeletClientCertificateExpiration and
# KubeletServerCertificateExpiration (each: warning below 604800 s, critical
# below 86400 s), KubeletClientCertificateRenewalErrors,
# KubeletServerCertificateRenewalErrors and KubeletDown.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
# The KubeNodePressure summary wording is corrected here ("has an active
# Condition"); no expression, label or duration is altered.
# NOTE(review): the ConfigMap name (…-rulefiles-0) suggests this object is
# generated from rule resources elsewhere; if so, the same wording fix belongs
# in that source too, or it will be overwritten - TODO confirm.
monitoring-kube-prometheus-stack-kubernetes-system-kubelet-3f5c198a-b883-4aa8-8f72-3001b24a1138.yaml: |
  groups:
  - name: kubernetes-system-kubelet
    rules:
    - alert: KubeNodeNotReady
      annotations:
        description: '{{ $labels.node }} has been unready for more than 15 minutes on
          cluster {{ $labels.cluster }}.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
        summary: Node is not ready.
      expr: |-
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
        and on (cluster, node)
        kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
      for: 15m
      labels:
        severity: warning
    - alert: KubeNodePressure
      annotations:
        description: '{{ $labels.node }} on cluster {{ $labels.cluster }} has active
          Condition {{ $labels.condition }}. This is caused by resource usage exceeding
          eviction thresholds.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodepressure
        summary: Node has an active Condition.
      expr: |-
        kube_node_status_condition{job="kube-state-metrics",condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1
        and on (cluster, node)
        kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
      for: 10m
      labels:
        severity: info
    - alert: KubeNodeUnreachable
      annotations:
        description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled
          on cluster {{ $labels.cluster }}.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
        summary: Node is unreachable.
      expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"}
        unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"})
        == 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
          }} of its Pod capacity on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
        summary: Kubelet is running at capacity.
      expr: |-
        (
          max by (cluster, instance) (
            kubelet_running_pods{job="kubelet", metrics_path="/metrics"} > 1
          )
          * on (cluster, instance) group_left(node)
          max by (cluster, instance, node) (
            kubelet_node_name{job="kubelet", metrics_path="/metrics"}
          )
        )
        / on (cluster, node) group_left()
        max by (cluster, node) (
          kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1
        ) > 0.95
      for: 15m
      labels:
        severity: info
    - alert: KubeNodeReadinessFlapping
      annotations:
        description: The readiness status of node {{ $labels.node }} has changed {{
          $value }} times in the last 15 minutes on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
        summary: Node readiness status is flapping.
      expr: |-
        sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
        and on (cluster, node)
        kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
      for: 15m
      labels:
        severity: warning
    - alert: KubeNodeEviction
      annotations:
        description: Node {{ $labels.node }} on {{ $labels.cluster }} is evicting Pods
          due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds
          are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeeviction
        summary: Node is evicting pods.
      expr: |-
        sum(rate(kubelet_evictions{job="kubelet", metrics_path="/metrics"}[15m])) by (cluster, eviction_signal, instance)
        * on (cluster, instance) group_left(node)
        max by (cluster, instance, node) (
          kubelet_node_name{job="kubelet", metrics_path="/metrics"}
        )
        > 0
      for: 0s
      labels:
        severity: info
    - alert: KubeletPlegDurationHigh
      annotations:
        description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
          duration of {{ $value }} seconds on node {{ $labels.node }} on cluster {{
          $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
        summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
      expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}
        >= 10
      for: 5m
      labels:
        severity: warning
    - alert: KubeletPodStartUpLatencyHigh
      annotations:
        description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
          on node {{ $labels.node }} on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
        summary: Kubelet Pod startup latency is too high.
      expr: |-
        histogram_quantile(0.99,
          sum by (cluster, instance, le) (
            topk by (cluster, instance, le, operation_type) (1,
              rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])
            )
          )
        )
        * on (cluster, instance) group_left(node)
        topk by (cluster, instance, node) (1,
          kubelet_node_name{job="kubelet", metrics_path="/metrics"}
        )
        > 60
      for: 15m
      labels:
        severity: warning
    - alert: KubeletClientCertificateExpiration
      annotations:
        description: Client certificate for Kubelet on node {{ $labels.node }} expires
          in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 604800
      labels:
        severity: warning
    - alert: KubeletClientCertificateExpiration
      annotations:
        description: Client certificate for Kubelet on node {{ $labels.node }} expires
          in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 86400
      labels:
        severity: critical
    - alert: KubeletServerCertificateExpiration
      annotations:
        description: Server certificate for Kubelet on node {{ $labels.node }} expires
          in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 604800
      labels:
        severity: warning
    - alert: KubeletServerCertificateExpiration
      annotations:
        description: Server certificate for Kubelet on node {{ $labels.node }} expires
          in {{ $value | humanizeDuration }} on cluster {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 86400
      labels:
        severity: critical
    - alert: KubeletClientCertificateRenewalErrors
      annotations:
        description: Kubelet on node {{ $labels.node }} has failed to renew its client
          certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster
          {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
        summary: Kubelet has failed to renew its client certificate.
      expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])
        > 0
      for: 15m
      labels:
        severity: warning
    - alert: KubeletServerCertificateRenewalErrors
      annotations:
        description: Kubelet on node {{ $labels.node }} has failed to renew its server
          certificate ({{ $value | humanize }} errors in the last 5 minutes) on cluster
          {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
        summary: Kubelet has failed to renew its server certificate.
      expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
      for: 15m
      labels:
        severity: warning
    - alert: KubeletDown
      annotations:
        description: Kubelet has disappeared from Prometheus target discovery on cluster
          {{ $labels.cluster }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
        summary: Target disappeared from Prometheus target discovery.
      expr: |-
        count by (cluster) (kube_node_info{job="kube-state-metrics"})
        unless on (cluster)
        count by (cluster) (up{job="kubelet", metrics_path="/metrics"} == 1)
      for: 15m
      labels:
        severity: critical
# Rule file for the "kubernetes-system-scheduler" group: a single alert,
# KubeSchedulerDown (severity critical), raised when
# absent(up{job="kube-scheduler"}) has held for 15m.
# Stored as a literal block scalar (`|`): the rule file content is the text
# below with the two-space block indent removed and one trailing newline.
monitoring-kube-prometheus-stack-kubernetes-system-scheduler-d7bc55b3-9301-4c17-81d7-76c4590104da.yaml: |
  groups:
  - name: kubernetes-system-scheduler
    rules:
    - alert: KubeSchedulerDown
      annotations:
        description: KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-scheduler"})
      for: 15m
      labels:
        severity: critical
# Rule file for the "node-exporter" alert group, stored as an escaped string
# (everything between the quotes is rule-file content, so notes can only be
# placed above the key). Every alert selects job="node-exporter".
# Alerts defined, in order:
#   - NodeFilesystemSpaceFillingUp: free space < 15% (warning) or < 10%
#     (critical), predict_linear over 6h dropping below zero within 24h / 4h,
#     and node_filesystem_readonly == 0; for 1h.
#   - NodeFilesystemAlmostOutOfSpace: free space < 5% (warning) or < 3%
#     (critical) on a filesystem that is not read-only; for 30m.
#   - NodeFilesystemFilesFillingUp: free inodes < 40% (warning) or < 20%
#     (critical) with the same 24h / 4h prediction; for 1h.
#   - NodeFilesystemAlmostOutOfFiles: free inodes < 5% (warning) or < 3%
#     (critical); for 1h.
#   - NodeNetworkReceiveErrs / NodeNetworkTransmitErrs: ratio of error rate to
#     packet rate over 2m above 0.01; for 1h.
#   - NodeHighNumberConntrackEntriesUsed: entries / limit > 0.75 (no `for`).
#   - NodeTextFileCollectorScrapeError: node_textfile_scrape_error == 1.
#   - NodeClockSkewDetected: offset beyond +/-0.05s while its 5m derivative has
#     the same sign (or is zero); for 10m.
#   - NodeClockNotSynchronising: sync status 0 throughout 5m and
#     node_timex_maxerror_seconds >= 16; for 10m.
#   - NodeRAIDDegraded (critical, for 15m): required disks minus active disks
#     > 0. NodeRAIDDiskFailure (warning): failed disks > 0.
#   - NodeFileDescriptorLimit: allocated / maximum file descriptors above 70%
#     (warning) or 90% (critical); for 15m.
#   - NodeCPUHighUsage (info), NodeSystemSaturation, NodeMemoryMajorPagesFaults,
#     NodeMemoryHighUtilization: each for 15m. NodeDiskIOSaturation: for 30m.
#   - NodeSystemdServiceFailed (5m), NodeSystemdServiceCrashlooping (15m),
#     NodeBondingDegraded (5m).
monitoring-kube-prometheus-stack-node-exporter-bb0e2fd6-3e20-4883-9c47-3d8d2acb1ac3.yaml: "groups:\n\
- name: node-exporter\n rules:\n - alert: NodeFilesystemSpaceFillingUp\n \
\ annotations:\n description: Filesystem on {{ $labels.device }}, mounted\
\ on {{ $labels.mountpoint\n }}, at {{ $labels.instance }} has only {{\
\ printf \"%.2f\" $value }}% available\n space left and is filling up.\n\
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup\n\
\ summary: Filesystem is predicted to run out of space within the next 24\
\ hours.\n expr: |-\n (\n node_filesystem_avail_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 15\n and\n predict_linear(node_filesystem_avail_bytes{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0\n and\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n )\n for: 1h\n labels:\n severity: warning\n - alert:\
\ NodeFilesystemSpaceFillingUp\n annotations:\n description: Filesystem\
\ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\
\ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \
\ space left and is filling up fast.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup\n\
\ summary: Filesystem is predicted to run out of space within the next 4\
\ hours.\n expr: |-\n (\n node_filesystem_avail_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 10\n and\n predict_linear(node_filesystem_avail_bytes{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0\n and\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n )\n for: 1h\n labels:\n severity: critical\n - alert:\
\ NodeFilesystemAlmostOutOfSpace\n annotations:\n description: Filesystem\
\ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\
\ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \
\ space left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace\n\
\ summary: Filesystem has less than 5% space left.\n expr: |-\n (\n\
\ node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} * 100 < 5\n and\n node_filesystem_readonly{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 30m\n labels:\n \
\ severity: warning\n - alert: NodeFilesystemAlmostOutOfSpace\n annotations:\n\
\ description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n\
\ }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}%\
\ available\n space left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace\n\
\ summary: Filesystem has less than 3% space left.\n expr: |-\n (\n\
\ node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} * 100 < 3\n and\n node_filesystem_readonly{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 30m\n labels:\n \
\ severity: critical\n - alert: NodeFilesystemFilesFillingUp\n annotations:\n\
\ description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n\
\ }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}%\
\ available\n inodes left and is filling up.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup\n\
\ summary: Filesystem is predicted to run out of inodes within the next 24\
\ hours.\n expr: |-\n (\n node_filesystem_files_free{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 40\n and\n predict_linear(node_filesystem_files_free{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0\n and\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n )\n for: 1h\n labels:\n severity: warning\n - alert:\
\ NodeFilesystemFilesFillingUp\n annotations:\n description: Filesystem\
\ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\
\ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \
\ inodes left and is filling up fast.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup\n\
\ summary: Filesystem is predicted to run out of inodes within the next 4\
\ hours.\n expr: |-\n (\n node_filesystem_files_free{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} * 100 < 20\n and\n predict_linear(node_filesystem_files_free{job=\"\
node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0\n and\n\
\ node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} == 0\n )\n for: 1h\n labels:\n severity: critical\n - alert:\
\ NodeFilesystemAlmostOutOfFiles\n annotations:\n description: Filesystem\
\ on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n }}, at {{\
\ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available\n \
\ inodes left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles\n\
\ summary: Filesystem has less than 5% inodes left.\n expr: |-\n \
\ (\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"\
} * 100 < 5\n and\n node_filesystem_readonly{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 1h\n labels:\n \
\ severity: warning\n - alert: NodeFilesystemAlmostOutOfFiles\n annotations:\n\
\ description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint\n\
\ }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}%\
\ available\n inodes left.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles\n\
\ summary: Filesystem has less than 3% inodes left.\n expr: |-\n \
\ (\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\
\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"\
} * 100 < 3\n and\n node_filesystem_readonly{job=\"node-exporter\"\
,fstype!=\"\",mountpoint!=\"\"} == 0\n )\n for: 1h\n labels:\n \
\ severity: critical\n - alert: NodeNetworkReceiveErrs\n annotations:\n \
\ description: '{{ $labels.instance }} interface {{ $labels.device }} has\
\ encountered\n {{ printf \"%.0f\" $value }} receive errors in the last\
\ two minutes.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs\n\
\ summary: Network interface is reporting many receive errors.\n expr:\
\ rate(node_network_receive_errs_total{job=\"node-exporter\"}[2m]) / rate(node_network_receive_packets_total{job=\"\
node-exporter\"}[2m])\n > 0.01\n for: 1h\n labels:\n severity:\
\ warning\n - alert: NodeNetworkTransmitErrs\n annotations:\n description:\
\ '{{ $labels.instance }} interface {{ $labels.device }} has encountered\n \
\ {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.'\n\
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs\n\
\ summary: Network interface is reporting many transmit errors.\n expr:\
\ rate(node_network_transmit_errs_total{job=\"node-exporter\"}[2m]) / rate(node_network_transmit_packets_total{job=\"\
node-exporter\"}[2m])\n > 0.01\n for: 1h\n labels:\n severity:\
\ warning\n - alert: NodeHighNumberConntrackEntriesUsed\n annotations:\n \
\ description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of\
\ conntrack\n entries are used.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused\n\
\ summary: Number of conntrack entries is getting close to the limit.\n expr:\
\ (node_nf_conntrack_entries{job=\"node-exporter\"} / node_nf_conntrack_entries_limit)\n\
\ > 0.75\n labels:\n severity: warning\n - alert: NodeTextFileCollectorScrapeError\n\
\ annotations:\n description: Node Exporter text file collector on {{\
\ $labels.instance }} failed\n to scrape.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror\n\
\ summary: Node Exporter text file collector failed to scrape.\n expr:\
\ node_textfile_scrape_error{job=\"node-exporter\"} == 1\n labels:\n severity:\
\ warning\n - alert: NodeClockSkewDetected\n annotations:\n description:\
\ Clock at {{ $labels.instance }} is out of sync by more than 0.05s.\n \
\ Ensure NTP is configured correctly on this host.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected\n\
\ summary: Clock skew detected.\n expr: |-\n (\n node_timex_offset_seconds{job=\"\
node-exporter\"} > 0.05\n and\n deriv(node_timex_offset_seconds{job=\"\
node-exporter\"}[5m]) >= 0\n )\n or\n (\n node_timex_offset_seconds{job=\"\
node-exporter\"} < -0.05\n and\n deriv(node_timex_offset_seconds{job=\"\
node-exporter\"}[5m]) <= 0\n )\n for: 10m\n labels:\n severity:\
\ warning\n - alert: NodeClockNotSynchronising\n annotations:\n description:\
\ Clock at {{ $labels.instance }} is not synchronising. Ensure NTP\n is\
\ configured on this host.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising\n\
\ summary: Clock not synchronising.\n expr: |-\n min_over_time(node_timex_sync_status{job=\"\
node-exporter\"}[5m]) == 0\n and\n node_timex_maxerror_seconds{job=\"\
node-exporter\"} >= 16\n for: 10m\n labels:\n severity: warning\n \
\ - alert: NodeRAIDDegraded\n annotations:\n description: RAID array '{{\
\ $labels.device }}' at {{ $labels.instance }} is\n in degraded state due\
\ to one or more disk failures. Number of spare drives\n is insufficient\
\ to fix issue automatically.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded\n\
\ summary: RAID Array is degraded.\n expr: node_md_disks_required{job=\"\
node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\
}\n - ignoring (state) (node_md_disks{state=\"active\",job=\"node-exporter\"\
,device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\
})\n > 0\n for: 15m\n labels:\n severity: critical\n - alert:\
\ NodeRAIDDiskFailure\n annotations:\n description: At least one device\
\ in RAID array at {{ $labels.instance }} failed.\n Array '{{ $labels.device\
\ }}' needs attention and possibly a disk swap.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure\n\
\ summary: Failed device in RAID array.\n expr: node_md_disks{state=\"\
failed\",job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\
}\n > 0\n labels:\n severity: warning\n - alert: NodeFileDescriptorLimit\n\
\ annotations:\n description: File descriptors limit at {{ $labels.instance\
\ }} is currently at\n {{ printf \"%.2f\" $value }}%.\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit\n\
\ summary: Kernel is predicted to exhaust file descriptors limit soon.\n\
\ expr: |-\n (\n node_filefd_allocated{job=\"node-exporter\"} *\
\ 100 / node_filefd_maximum{job=\"node-exporter\"} > 70\n )\n for: 15m\n\
\ labels:\n severity: warning\n - alert: NodeFileDescriptorLimit\n \
\ annotations:\n description: File descriptors limit at {{ $labels.instance\
\ }} is currently at\n {{ printf \"%.2f\" $value }}%.\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit\n\
\ summary: Kernel is predicted to exhaust file descriptors limit soon.\n\
\ expr: |-\n (\n node_filefd_allocated{job=\"node-exporter\"} *\
\ 100 / node_filefd_maximum{job=\"node-exporter\"} > 90\n )\n for: 15m\n\
\ labels:\n severity: critical\n - alert: NodeCPUHighUsage\n annotations:\n\
\ description: |\n CPU usage at {{ $labels.instance }} has been above\
\ 90% for the last 15 minutes, is currently at {{ printf \"%.2f\" $value }}%.\n\
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage\n\
\ summary: High CPU usage.\n expr: sum without(mode) (avg without (cpu)\
\ (rate(node_cpu_seconds_total{job=\"node-exporter\",\n mode!~\"idle|iowait\"\
}[2m]))) * 100 > 90\n for: 15m\n labels:\n severity: info\n - alert:\
\ NodeSystemSaturation\n annotations:\n description: |\n System\
\ load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes,\
\ is currently at {{ printf \"%.2f\" $value }}.\n This might indicate this\
\ instance resources saturation and can cause it becoming unresponsive.\n \
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation\n\
\ summary: System saturated, load per core is very high.\n expr: |-\n\
\ node_load1{job=\"node-exporter\"}\n / count without (cpu, mode) (node_cpu_seconds_total{job=\"\
node-exporter\", mode=\"idle\"}) > 2\n for: 15m\n labels:\n severity:\
\ warning\n - alert: NodeMemoryMajorPagesFaults\n annotations:\n description:\
\ |\n Memory major pages are occurring at very high rate at {{ $labels.instance\
\ }}, 500 major page faults per second for the last 15 minutes, is currently at\
\ {{ printf \"%.2f\" $value }}.\n Please check that there is enough memory\
\ available at this instance.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults\n\
\ summary: Memory major page faults are occurring at very high rate.\n \
\ expr: rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m]) > 500\n for:\
\ 15m\n labels:\n severity: warning\n - alert: NodeMemoryHighUtilization\n\
\ annotations:\n description: |\n Memory is filling up at {{ $labels.instance\
\ }}, has been above 90% for the last 15 minutes, is currently at {{ printf \"\
%.2f\" $value }}%.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization\n\
\ summary: Host is running out of memory.\n expr: 100 - (node_memory_MemAvailable_bytes{job=\"\
node-exporter\"} / node_memory_MemTotal_bytes{job=\"node-exporter\"}\n *\
\ 100) > 90\n for: 15m\n labels:\n severity: warning\n - alert: NodeDiskIOSaturation\n\
\ annotations:\n description: |\n Disk IO queue (aqu-sq) is high\
\ on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the\
\ last 30 minutes, is currently at {{ printf \"%.2f\" $value }}.\n This\
\ symptom might indicate disk saturation.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation\n\
\ summary: Disk IO queue is high.\n expr: rate(node_disk_io_time_weighted_seconds_total{job=\"\
node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\
}[5m])\n > 10\n for: 30m\n labels:\n severity: warning\n - alert:\
\ NodeSystemdServiceFailed\n annotations:\n description: Systemd service\
\ {{ $labels.name }} has entered failed state at\n {{ $labels.instance\
\ }}\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed\n\
\ summary: Systemd service has entered failed state.\n expr: node_systemd_unit_state{job=\"\
node-exporter\", state=\"failed\"} == 1\n for: 5m\n labels:\n severity:\
\ warning\n - alert: NodeSystemdServiceCrashlooping\n annotations:\n \
\ description: Systemd service {{ $labels.name }} has been restarted too many\n\
\ times at {{ $labels.instance }} for the last 15 minutes. Please check\
\ if service\n is crash looping.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicecrashlooping\n\
\ summary: Systemd service keeps restarting, possibly crash looping.\n \
\ expr: increase(node_systemd_service_restart_total{job=\"node-exporter\"}[5m])\
\ >\n 2\n for: 15m\n labels:\n severity: warning\n - alert: NodeBondingDegraded\n\
\ annotations:\n description: Bonding interface {{ $labels.master }} on\
\ {{ $labels.instance }}\n is in degraded state due to one or more slave\
\ failures.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded\n\
\ summary: Bonding interface is degraded.\n expr: (node_bonding_slaves{job=\"\
node-exporter\"} - node_bonding_active{job=\"node-exporter\"})\n != 0\n \
\ for: 5m\n labels:\n severity: warning\n"
# Rule file for the "node-exporter.rules" recording-rule group, stored as an
# escaped string (no alerts). All inputs select job="node-exporter".
# Series recorded:
#   instance:node_num_cpu:sum               - count of mode="idle" CPU series
#   instance:node_cpu_utilisation:rate5m    - 1 minus the per-CPU average of
#       the summed 5m rates of the idle, iowait and steal modes
#   instance:node_load1_per_cpu:ratio       - node_load1 / node_num_cpu:sum
#   instance:node_memory_utilisation:ratio  - 1 - available / MemTotal, where
#       available is MemAvailable, or else Buffers + Cached + MemFree + Slab
#   instance:node_vmstat_pgmajfault:rate5m  - 5m rate of major page faults
#   instance_device:node_disk_io_time_seconds:rate5m and
#   instance_device:node_disk_io_time_weighted_seconds:rate5m
#       - 5m rates, restricted by the device regex to mmcblk/nvme/rbd/sd/vd/
#         xvd/dm-/md/dasd style names (optionally prefixed with /dev/)
#   instance:node_network_{receive,transmit}_{bytes,drop}_excluding_lo:rate5m
#       - 5m rates summed over devices with device!="lo"
#   instance:node_network_{receive,transmit}_{bytes,drop}_physical:rate5m
#       - 5m rates summed over devices with device!~"lo|veth.+"
monitoring-kube-prometheus-stack-node-exporter.rules-501fd5f0-0366-455a-80cc-5e208856f211.yaml: "groups:\n\
- name: node-exporter.rules\n rules:\n - expr: |-\n count without (cpu,\
\ mode) (\n node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"\
}\n )\n record: instance:node_num_cpu:sum\n - expr: |-\n 1 - avg\
\ without (cpu) (\n sum without (mode) (rate(node_cpu_seconds_total{job=\"\
node-exporter\", mode=~\"idle|iowait|steal\"}[5m]))\n )\n record: instance:node_cpu_utilisation:rate5m\n\
\ - expr: |-\n (\n node_load1{job=\"node-exporter\"}\n /\n \
\ instance:node_num_cpu:sum{job=\"node-exporter\"}\n )\n record:\
\ instance:node_load1_per_cpu:ratio\n - expr: |-\n 1 - (\n (\n \
\ node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n or\n\
\ (\n node_memory_Buffers_bytes{job=\"node-exporter\"}\n \
\ +\n node_memory_Cached_bytes{job=\"node-exporter\"}\n \
\ +\n node_memory_MemFree_bytes{job=\"node-exporter\"}\n\
\ +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n \
\ )\n )\n /\n node_memory_MemTotal_bytes{job=\"node-exporter\"\
}\n )\n record: instance:node_memory_utilisation:ratio\n - expr: rate(node_vmstat_pgmajfault{job=\"\
node-exporter\"}[5m])\n record: instance:node_vmstat_pgmajfault:rate5m\n -\
\ expr: rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device=~\"\
(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[5m])\n\
\ record: instance_device:node_disk_io_time_seconds:rate5m\n - expr: rate(node_disk_io_time_weighted_seconds_total{job=\"\
node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"\
}[5m])\n record: instance_device:node_disk_io_time_weighted_seconds:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_receive_bytes_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_receive_bytes_excluding_lo:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_transmit_bytes_excluding_lo:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_receive_drop_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_receive_drop_excluding_lo:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_transmit_drop_total{job=\"\
node-exporter\", device!=\"lo\"}[5m])\n )\n record: instance:node_network_transmit_drop_excluding_lo:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_receive_bytes_total{job=\"\
node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_receive_bytes_physical:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"\
node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_transmit_bytes_physical:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_receive_drop_total{job=\"\
node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_receive_drop_physical:rate5m\n\
\ - expr: |-\n sum without (device) (\n rate(node_network_transmit_drop_total{job=\"\
node-exporter\", device!~\"lo|veth.+\"}[5m])\n )\n record: instance:node_network_transmit_drop_physical:rate5m\n"
# Rule file for the "node-network" alert group, stored as an escaped string.
# It defines one alert, NodeNetworkInterfaceFlapping (severity: warning):
# node_network_up for a device not matching "veth.+" changed more than twice
# within 2m, and that condition held for 2m.
monitoring-kube-prometheus-stack-node-network-6268a5b5-2d1a-4ed0-b8b3-a03a12b3390d.yaml: "groups:\n\
- name: node-network\n rules:\n - alert: NodeNetworkInterfaceFlapping\n annotations:\n\
\ description: Network interface \"{{ $labels.device }}\" changing its up\
\ status\n often on node-exporter {{ $labels.namespace }}/{{ $labels.pod\
\ }}\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping\n\
\ summary: Network interface is often changing its status\n expr: changes(node_network_up{job=\"\
node-exporter\",device!~\"veth.+\"}[2m]) > 2\n for: 2m\n labels:\n \
\ severity: warning\n"
# Rule file for the "node.rules" recording-rule group, stored as an escaped
# string (no alerts). Series recorded:
#   node_namespace_pod:kube_pod_info:
#       - one kube_pod_info series per (cluster, namespace, pod), taken from
#         job="kube-state-metrics" and limited to pods with a non-empty node
#   node:node_num_cpu:sum
#       - idle-mode CPU series counted per (cluster, node); the node label is
#         joined in from node_namespace_pod:kube_pod_info: via group_left
#   :node_memory_MemAvailable_bytes:sum
#       - per-cluster sum of MemAvailable, or else of
#         Buffers + Cached + MemFree + Slab
#   node:node_cpu_utilization:ratio_rate5m
#       - avg by (cluster, node) of the summed 5m rates of CPU seconds in
#         modes other than idle, iowait and steal
#   cluster:node_cpu:ratio_rate5m
#       - avg by (cluster) of node:node_cpu_utilization:ratio_rate5m
monitoring-kube-prometheus-stack-node.rules-a9b43f41-87eb-4fea-856f-1f513c9d1ee3.yaml: "groups:\n\
- name: node.rules\n rules:\n - expr: |-\n topk by (cluster, namespace,\
\ pod) (1,\n max by (cluster, node, namespace, pod) (\n label_replace(kube_pod_info{job=\"\
kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n ))\n\
\ record: 'node_namespace_pod:kube_pod_info:'\n - expr: |-\n count by\
\ (cluster, node) (\n node_cpu_seconds_total{mode=\"idle\",job=\"node-exporter\"\
}\n * on (cluster, namespace, pod) group_left(node)\n topk by (cluster,\
\ namespace, pod) (1, node_namespace_pod:kube_pod_info:)\n )\n record:\
\ node:node_num_cpu:sum\n - expr: |-\n sum(\n node_memory_MemAvailable_bytes{job=\"\
node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"\
} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"\
node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n\
\ )\n ) by (cluster)\n record: :node_memory_MemAvailable_bytes:sum\n\
\ - expr: |-\n avg by (cluster, node) (\n sum without (mode) (\n\
\ rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"\
steal\",job=\"node-exporter\"}[5m])\n )\n )\n record: node:node_cpu_utilization:ratio_rate5m\n\
\ - expr: |-\n avg by (cluster) (\n node:node_cpu_utilization:ratio_rate5m\n\
\ )\n record: cluster:node_cpu:ratio_rate5m\n"
monitoring-kube-prometheus-stack-prometheus-7347b3a7-f0fa-4d43-bd9e-c3f35a7087b9.yaml: "groups:\n\
- name: prometheus\n rules:\n - alert: PrometheusBadConfig\n annotations:\n\
\ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed\
\ to\n reload its configuration.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig\n\
\ summary: Failed Prometheus configuration reload.\n expr: |-\n #\
\ Without max_over_time, failed scrapes could create false negatives, see\n \
\ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for\
\ details.\n max_over_time(prometheus_config_last_reload_successful{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) == 0\n for:\
\ 10m\n labels:\n severity: critical\n - alert: PrometheusSDRefreshFailure\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has failed to\n refresh SD with mechanism {{$labels.mechanism}}.\n \
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure\n\
\ summary: Failed Prometheus SD refresh.\n expr: increase(prometheus_sd_refresh_failures_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[10m])\n > 0\n\
\ for: 20m\n labels:\n severity: warning\n - alert: PrometheusKubernetesListWatchFailures\n\
\ annotations:\n description: Kubernetes service discovery of Prometheus\
\ {{$labels.namespace}}/{{$labels.pod}}\n is experiencing {{ printf \"\
%.0f\" $value }} failures with LIST/WATCH requests\n to the Kubernetes\
\ API in the last 5 minutes.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuskuberneteslistwatchfailures\n\
\ summary: Requests in Kubernetes SD are failing.\n expr: increase(prometheus_sd_kubernetes_failures_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusNotificationQueueRunningFull\n\
\ annotations:\n description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}\n\
\ is running full.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull\n\
\ summary: Prometheus alert notification queue predicted to run full in less\
\ than\n 30m.\n expr: |-\n # Without min_over_time, failed scrapes\
\ could create false negatives, see\n # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n (\n predict_linear(prometheus_notifications_queue_length{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m], 60 * 30)\n \
\ >\n min_over_time(prometheus_notifications_queue_capacity{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n )\n \
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers\n\
\ annotations:\n description: '{{ printf \"%.1f\" $value }}% of alerts\
\ sent by Prometheus {{$labels.namespace}}/{{$labels.pod}}\n to Alertmanager\
\ {{$labels.alertmanager}} were affected by errors.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers\n\
\ summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager\n\
\ were affected by errors.\n expr: |-\n (\n rate(prometheus_notifications_errors_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n /\n \
\ rate(prometheus_notifications_sent_total{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\"}[5m])\n )\n * 100\n > 1\n for: 15m\n\
\ labels:\n severity: warning\n - alert: PrometheusNotConnectedToAlertmanagers\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ is not connected\n to any Alertmanagers.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers\n\
\ summary: Prometheus is not connected to any Alertmanagers.\n expr: |-\n\
\ # Without max_over_time, failed scrapes could create false negatives, see\n\
\ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) < 1\n for:\
\ 10m\n labels:\n severity: warning\n - alert: PrometheusTSDBReloadsFailing\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has detected {{$value\n | humanize}} reload failures over the last 3h.\n\
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing\n\
\ summary: Prometheus has issues reloading blocks from disk.\n expr: increase(prometheus_tsdb_reloads_failures_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[3h])\n > 0\n\
\ for: 4h\n labels:\n severity: warning\n - alert: PrometheusTSDBCompactionsFailing\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has detected {{$value\n | humanize}} compaction failures over the last\
\ 3h.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing\n\
\ summary: Prometheus has issues compacting blocks.\n expr: increase(prometheus_tsdb_compactions_failed_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[3h])\n > 0\n\
\ for: 4h\n labels:\n severity: warning\n - alert: PrometheusNotIngestingSamples\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ is not ingesting\n samples.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples\n\
\ summary: Prometheus is not ingesting samples.\n expr: |-\n (\n\
\ sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])) <= 0\n \
\ and\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}) > 0\n or\n\
\ sum without(rule_group) (prometheus_rule_group_rules{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\"}) > 0\n )\n )\n for: 10m\n labels:\n\
\ severity: warning\n - alert: PrometheusDuplicateTimestamps\n annotations:\n\
\ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping\
\ {{\n printf \"%.4g\" $value }} samples/s with different values but duplicated\
\ timestamp.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps\n\
\ summary: Prometheus is dropping samples with duplicate timestamps.\n \
\ expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 10m\n labels:\n severity: warning\n - alert: PrometheusOutOfOrderTimestamps\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ is dropping {{\n printf \"%.4g\" $value }} samples/s with timestamps\
\ arriving out of order.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps\n\
\ summary: Prometheus drops samples with out-of-order timestamps.\n expr:\
\ rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\"}[5m])\n > 0\n for: 10m\n labels:\n severity:\
\ warning\n - alert: PrometheusRemoteStorageFailures\n annotations:\n \
\ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send\n\
\ {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{\n\
\ $labels.url }}\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures\n\
\ summary: Prometheus fails to send samples to remote storage.\n expr:\
\ |-\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]))\n /\n\
\ (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]))\n +\n\
\ (rate(prometheus_remote_storage_succeeded_samples_total{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m]))\n )\n\
\ )\n * 100\n > 1\n for: 15m\n labels:\n severity: critical\n\
\ - alert: PrometheusRemoteWriteBehind\n annotations:\n description:\
\ Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is\n {{\
\ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url\n\
\ }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind\n\
\ summary: Prometheus remote write is behind.\n expr: |-\n # Without\
\ max_over_time, failed scrapes could create false negatives, see\n # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n (\n max_over_time(prometheus_remote_storage_queue_highest_timestamp_seconds{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n -\n \
\ max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n )\n \
\ > 120\n for: 15m\n labels:\n severity: critical\n - alert: PrometheusRemoteWriteDesiredShards\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ remote write desired\n shards calculation wants to run {{ $value }} shards\
\ for queue {{ $labels.remote_name}}:{{\n $labels.url }}, which is more\
\ than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"\
%s\",job=\"kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}`\n \
\ $labels.instance | query | first | value }}.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards\n\
\ summary: Prometheus remote write desired shards calculation wants to run\
\ more\n than configured max shards.\n expr: |-\n # Without max_over_time,\
\ failed scrapes could create false negatives, see\n # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0\
\ for details.\n (\n max_over_time(prometheus_remote_storage_shards_desired{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n >\n \
\ max_over_time(prometheus_remote_storage_shards_max{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\"}[5m])\n )\n for: 15m\n labels:\n severity:\
\ warning\n - alert: PrometheusRuleFailures\n annotations:\n description:\
\ Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to\n evaluate\
\ {{ printf \"%.0f\" $value }} rules in the last 5m.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures\n\
\ summary: Prometheus is failing rule evaluations.\n expr: increase(prometheus_rule_evaluation_failures_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: critical\n - alert: PrometheusMissingRuleEvaluations\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has missed {{\n printf \"%.0f\" $value }} rule group evaluations in the\
\ last 5m.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations\n\
\ summary: Prometheus is missing rule evaluations due to slow rule group\
\ evaluation.\n expr: increase(prometheus_rule_group_iterations_missed_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusTargetLimitHit\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has dropped {{\n printf \"%.0f\" $value }} targets because the number\
\ of targets exceeded the\n configured target_limit.\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit\n\
\ summary: Prometheus has dropped targets because some scrape configs have\
\ exceeded\n the targets limit.\n expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusLabelLimitHit\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has dropped {{\n printf \"%.0f\" $value }} targets because some samples\
\ exceeded the configured\n label_limit, label_name_length_limit or label_value_length_limit.\n\
\ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit\n\
\ summary: Prometheus has dropped targets because some scrape configs have\
\ exceeded\n the labels limit.\n expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusScrapeBodySizeLimitHit\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has failed {{\n printf \"%.0f\" $value }} scrapes in the last 5m because\
\ some targets exceeded\n the configured body_size_limit.\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit\n\
\ summary: Prometheus has dropped some targets that exceeded body size limit.\n\
\ expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusScrapeSampleLimitHit\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ has failed {{\n printf \"%.0f\" $value }} scrapes in the last 5m because\
\ some targets exceeded\n the configured sample_limit.\n runbook_url:\
\ https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit\n\
\ summary: Prometheus has failed scrapes that have exceeded the configured\
\ sample\n limit.\n expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusTargetSyncFailure\n\
\ annotations:\n description: '{{ printf \"%.0f\" $value }} targets in\
\ Prometheus {{$labels.namespace}}/{{$labels.pod}}\n have failed to sync\
\ because invalid configuration was supplied.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure\n\
\ summary: Prometheus has failed to sync targets.\n expr: increase(prometheus_target_sync_failed_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[30m])\n > 0\n\
\ for: 5m\n labels:\n severity: critical\n - alert: PrometheusHighQueryLoad\n\
\ annotations:\n description: Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ query API has\n less than 20% available capacity in its query engine\
\ for the last 15 minutes.\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload\n\
\ summary: Prometheus is reaching its maximum capacity serving concurrent\
\ requests.\n expr: avg_over_time(prometheus_engine_queries{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\"}[5m])\n / max_over_time(prometheus_engine_queries_concurrent_max{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\"}[5m])\n > 0.8\n\
\ for: 15m\n labels:\n severity: warning\n - alert: PrometheusErrorSendingAlertsToAnyAlertmanager\n\
\ annotations:\n description: '{{ printf \"%.1f\" $value }}% minimum errors\
\ while sending alerts\n from Prometheus {{$labels.namespace}}/{{$labels.pod}}\
\ to any Alertmanager.'\n runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager\n\
\ summary: Prometheus encounters more than 3% errors sending alerts to any\
\ Alertmanager.\n expr: |-\n min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"\
kube-prometheus-stack-prometheus\",namespace=\"monitoring\",alertmanager!~``}[5m])\n\
\ /\n rate(prometheus_notifications_sent_total{job=\"kube-prometheus-stack-prometheus\"\
,namespace=\"monitoring\",alertmanager!~``}[5m])\n )\n * 100\n \
\ > 3\n for: 15m\n labels:\n severity: critical\n"
# Rule file "prometheus-operator": alerts on the health of the Prometheus
# operator itself (list/watch errors, failed syncs and reconciles, status
# update errors, node address lookup errors, readiness, rejected resources).
# Every selector is pinned to job="kube-prometheus-stack-operator" in the
# "monitoring" namespace.
# The value is a literal block scalar (|): each line below is one line of the
# embedded rule file and exactly one trailing newline is kept. Everything
# inside the block, including any '#', is part of the value, so notes about
# the rules belong up here and not between the rule lines.
monitoring-kube-prometheus-stack-prometheus-operator-6e123f81-1bd4-4e2f-b8ae-c1287bf5cd37.yaml: |
  groups:
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorListErrors
      annotations:
        description: Errors while performing List operations in controller {{$labels.controller}}
          in {{$labels.namespace}} namespace.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
        summary: Errors while performing list operations in controller.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))
        / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])))
        > 0.4
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusOperatorWatchErrors
      annotations:
        description: Errors while performing watch operations in controller {{$labels.controller}}
          in {{$labels.namespace}} namespace.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
        summary: Errors while performing watch operations in controller.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))
        / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
        > 0.4
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusOperatorSyncFailed
      annotations:
        description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
          namespace fails to reconcile {{ $value }} objects.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
        summary: Last controller reconciliation failed
      expr: min_over_time(prometheus_operator_syncs{status="failed",job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
        > 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        description: '{{ $value | humanizePercentage }} of reconciling operations failed
          for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
        summary: Errors while reconciling objects.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
        / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
        > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorStatusUpdateErrors
      annotations:
        description: '{{ $value | humanizePercentage }} of status update operations
          failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
          namespace.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorstatusupdateerrors
        summary: Errors while updating objects status.
      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
        / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
        > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        description: Errors while reconciling Prometheus in {{ $labels.namespace }}
          Namespace.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
        summary: Errors while reconciling Prometheus.
      expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
        > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNotReady
      annotations:
        description: Prometheus operator in {{ $labels.namespace }} namespace isn't
          ready to reconcile {{ $labels.controller }} resources.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
        summary: Prometheus operator not ready
      expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
        == 0)
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusOperatorRejectedResources
      annotations:
        description: Prometheus operator in {{ $labels.namespace }} namespace rejected
          {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }}
          resources.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
        summary: Resources rejected by Prometheus operator
      expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
        > 0
      for: 5m
      labels:
        severity: warning
# Object type of this manifest.
kind: ConfigMap
# Identity and ownership of the ConfigMap. Everything below is nested under
# `metadata`; at column 0 these keys would be read as top-level fields of the
# manifest instead.
metadata:
  # The labels mark this object as managed by prometheus-operator and name the
  # Prometheus instance it belongs to.
  labels:
    app.kubernetes.io/managed-by: prometheus-operator
    managed-by: prometheus-operator
    prometheus-name: kube-prometheus-stack-prometheus
  name: prometheus-kube-prometheus-stack-prometheus-rulefiles-0
  namespace: monitoring
  # Single owner: the Prometheus custom resource, flagged as the controller
  # of this object.
  ownerReferences:
  - apiVersion: monitoring.coreos.com/v1
    blockOwnerDeletion: true
    controller: true
    kind: Prometheus
    name: kube-prometheus-stack-prometheus
    uid: f0355616-4bfa-4409-8b5f-c1c815ee7a2a