fix(monitoring): correct alert rule pipeline to A→B(reduce)→C(threshold)
Grafana's threshold expression requires a scalar (reduced) input, not a raw time series. Added an explicit reduce step (type: reduce, reducer: last) as refId B between the Prometheus query (A) and the threshold check (C). All 4 rules were updated: Pod CrashLoopBackOff, Disco > 80%, RAM > 85%, and Pod Failed/Unknown. The condition field was changed from B to C on each rule.
This commit is contained in:
@@ -43,7 +43,7 @@ data:
|
|||||||
|
|
||||||
- uid: homelab-crashloop
|
- uid: homelab-crashloop
|
||||||
title: "Pod CrashLoopBackOff"
|
title: "Pod CrashLoopBackOff"
|
||||||
condition: B
|
condition: C
|
||||||
for: 1m
|
for: 1m
|
||||||
noDataState: OK
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
@@ -66,18 +66,25 @@ data:
|
|||||||
relativeTimeRange: {from: 0, to: 0}
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
datasourceUid: "-100"
|
datasourceUid: "-100"
|
||||||
model:
|
model:
|
||||||
type: threshold
|
type: reduce
|
||||||
refId: B
|
refId: B
|
||||||
|
expression: A
|
||||||
|
reducer: last
|
||||||
|
settings:
|
||||||
|
mode: ""
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
|
datasourceUid: "-100"
|
||||||
|
model:
|
||||||
|
type: threshold
|
||||||
|
refId: C
|
||||||
|
expression: B
|
||||||
conditions:
|
conditions:
|
||||||
- type: query
|
- evaluator: {params: [0], type: gt}
|
||||||
evaluator: {params: [0], type: gt}
|
|
||||||
operator: {type: and}
|
|
||||||
query: {params: [A]}
|
|
||||||
reducer: {params: [], type: last}
|
|
||||||
|
|
||||||
- uid: homelab-disk-high
|
- uid: homelab-disk-high
|
||||||
title: "Disco > 80%"
|
title: "Disco > 80%"
|
||||||
condition: B
|
condition: C
|
||||||
for: 5m
|
for: 5m
|
||||||
noDataState: NoData
|
noDataState: NoData
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
@@ -100,18 +107,25 @@ data:
|
|||||||
relativeTimeRange: {from: 0, to: 0}
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
datasourceUid: "-100"
|
datasourceUid: "-100"
|
||||||
model:
|
model:
|
||||||
type: threshold
|
type: reduce
|
||||||
refId: B
|
refId: B
|
||||||
|
expression: A
|
||||||
|
reducer: last
|
||||||
|
settings:
|
||||||
|
mode: ""
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
|
datasourceUid: "-100"
|
||||||
|
model:
|
||||||
|
type: threshold
|
||||||
|
refId: C
|
||||||
|
expression: B
|
||||||
conditions:
|
conditions:
|
||||||
- type: query
|
- evaluator: {params: [0.8], type: gt}
|
||||||
evaluator: {params: [0.8], type: gt}
|
|
||||||
operator: {type: and}
|
|
||||||
query: {params: [A]}
|
|
||||||
reducer: {params: [], type: last}
|
|
||||||
|
|
||||||
- uid: homelab-ram-high
|
- uid: homelab-ram-high
|
||||||
title: "RAM > 85%"
|
title: "RAM > 85%"
|
||||||
condition: B
|
condition: C
|
||||||
for: 5m
|
for: 5m
|
||||||
noDataState: NoData
|
noDataState: NoData
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
@@ -134,18 +148,25 @@ data:
|
|||||||
relativeTimeRange: {from: 0, to: 0}
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
datasourceUid: "-100"
|
datasourceUid: "-100"
|
||||||
model:
|
model:
|
||||||
type: threshold
|
type: reduce
|
||||||
refId: B
|
refId: B
|
||||||
|
expression: A
|
||||||
|
reducer: last
|
||||||
|
settings:
|
||||||
|
mode: ""
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
|
datasourceUid: "-100"
|
||||||
|
model:
|
||||||
|
type: threshold
|
||||||
|
refId: C
|
||||||
|
expression: B
|
||||||
conditions:
|
conditions:
|
||||||
- type: query
|
- evaluator: {params: [0.85], type: gt}
|
||||||
evaluator: {params: [0.85], type: gt}
|
|
||||||
operator: {type: and}
|
|
||||||
query: {params: [A]}
|
|
||||||
reducer: {params: [], type: last}
|
|
||||||
|
|
||||||
- uid: homelab-pod-failed
|
- uid: homelab-pod-failed
|
||||||
title: "Pod Failed/Unknown"
|
title: "Pod Failed/Unknown"
|
||||||
condition: B
|
condition: C
|
||||||
for: 3m
|
for: 3m
|
||||||
noDataState: OK
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
@@ -168,11 +189,18 @@ data:
|
|||||||
relativeTimeRange: {from: 0, to: 0}
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
datasourceUid: "-100"
|
datasourceUid: "-100"
|
||||||
model:
|
model:
|
||||||
type: threshold
|
type: reduce
|
||||||
refId: B
|
refId: B
|
||||||
|
expression: A
|
||||||
|
reducer: last
|
||||||
|
settings:
|
||||||
|
mode: ""
|
||||||
|
- refId: C
|
||||||
|
relativeTimeRange: {from: 0, to: 0}
|
||||||
|
datasourceUid: "-100"
|
||||||
|
model:
|
||||||
|
type: threshold
|
||||||
|
refId: C
|
||||||
|
expression: B
|
||||||
conditions:
|
conditions:
|
||||||
- type: query
|
- evaluator: {params: [0], type: gt}
|
||||||
evaluator: {params: [0], type: gt}
|
|
||||||
operator: {type: and}
|
|
||||||
query: {params: [A]}
|
|
||||||
reducer: {params: [], type: last}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user