fix(monitoring): correct alert rule pipeline to A→B(reduce)→C(threshold)

Grafana's threshold expression requires a reduced (single-value) input,
not a raw time series. Added an explicit reduce step (type: reduce,
reducer: last) as refId B between the Prometheus query (A) and the
threshold check (C).

All 4 rules updated: Pod CrashLoopBackOff, Disco > 80%, RAM > 85%,
Pod Failed/Unknown.
The `condition` field was changed from B to C on each rule.
This commit is contained in:
chemavx
2026-04-26 15:46:39 +00:00
parent bb64cc9e62
commit 4facdd8515
+56 -28
View File
@@ -43,7 +43,7 @@ data:
- uid: homelab-crashloop
title: "Pod CrashLoopBackOff"
condition: B
condition: C
for: 1m
noDataState: OK
execErrState: Error
@@ -66,18 +66,25 @@ data:
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
type: reduce
refId: B
expression: A
reducer: last
settings:
mode: ""
- refId: C
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
refId: C
expression: B
conditions:
- type: query
evaluator: {params: [0], type: gt}
operator: {type: and}
query: {params: [A]}
reducer: {params: [], type: last}
- evaluator: {params: [0], type: gt}
- uid: homelab-disk-high
title: "Disco > 80%"
condition: B
condition: C
for: 5m
noDataState: NoData
execErrState: Error
@@ -100,18 +107,25 @@ data:
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
type: reduce
refId: B
expression: A
reducer: last
settings:
mode: ""
- refId: C
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
refId: C
expression: B
conditions:
- type: query
evaluator: {params: [0.8], type: gt}
operator: {type: and}
query: {params: [A]}
reducer: {params: [], type: last}
- evaluator: {params: [0.8], type: gt}
- uid: homelab-ram-high
title: "RAM > 85%"
condition: B
condition: C
for: 5m
noDataState: NoData
execErrState: Error
@@ -134,18 +148,25 @@ data:
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
type: reduce
refId: B
expression: A
reducer: last
settings:
mode: ""
- refId: C
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
refId: C
expression: B
conditions:
- type: query
evaluator: {params: [0.85], type: gt}
operator: {type: and}
query: {params: [A]}
reducer: {params: [], type: last}
- evaluator: {params: [0.85], type: gt}
- uid: homelab-pod-failed
title: "Pod Failed/Unknown"
condition: B
condition: C
for: 3m
noDataState: OK
execErrState: Error
@@ -168,11 +189,18 @@ data:
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
type: reduce
refId: B
expression: A
reducer: last
settings:
mode: ""
- refId: C
relativeTimeRange: {from: 0, to: 0}
datasourceUid: "-100"
model:
type: threshold
refId: C
expression: B
conditions:
- type: query
evaluator: {params: [0], type: gt}
operator: {type: and}
query: {params: [A]}
reducer: {params: [], type: last}
- evaluator: {params: [0], type: gt}