fix(monitoring): correct alert rule pipeline to A→B(reduce)→C(threshold)
Grafana's threshold expression requires a scalar input, not a raw time series. Added an explicit reduce step (type: reduce, reducer: last) as refId B between the Prometheus query (A) and the threshold check (C). All 4 rules updated: Pod CrashLoopBackOff, Disco > 80%, RAM > 85%, Pod Failed/Unknown. The condition field was changed from B to C on each rule.
This commit is contained in:
@@ -43,7 +43,7 @@ data:
|
||||
|
||||
- uid: homelab-crashloop
|
||||
title: "Pod CrashLoopBackOff"
|
||||
condition: B
|
||||
condition: C
|
||||
for: 1m
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
@@ -66,18 +66,25 @@ data:
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
type: reduce
|
||||
refId: B
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: ""
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
refId: C
|
||||
expression: B
|
||||
conditions:
|
||||
- type: query
|
||||
evaluator: {params: [0], type: gt}
|
||||
operator: {type: and}
|
||||
query: {params: [A]}
|
||||
reducer: {params: [], type: last}
|
||||
- evaluator: {params: [0], type: gt}
|
||||
|
||||
- uid: homelab-disk-high
|
||||
title: "Disco > 80%"
|
||||
condition: B
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
@@ -100,18 +107,25 @@ data:
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
type: reduce
|
||||
refId: B
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: ""
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
refId: C
|
||||
expression: B
|
||||
conditions:
|
||||
- type: query
|
||||
evaluator: {params: [0.8], type: gt}
|
||||
operator: {type: and}
|
||||
query: {params: [A]}
|
||||
reducer: {params: [], type: last}
|
||||
- evaluator: {params: [0.8], type: gt}
|
||||
|
||||
- uid: homelab-ram-high
|
||||
title: "RAM > 85%"
|
||||
condition: B
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: NoData
|
||||
execErrState: Error
|
||||
@@ -134,18 +148,25 @@ data:
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
type: reduce
|
||||
refId: B
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: ""
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
refId: C
|
||||
expression: B
|
||||
conditions:
|
||||
- type: query
|
||||
evaluator: {params: [0.85], type: gt}
|
||||
operator: {type: and}
|
||||
query: {params: [A]}
|
||||
reducer: {params: [], type: last}
|
||||
- evaluator: {params: [0.85], type: gt}
|
||||
|
||||
- uid: homelab-pod-failed
|
||||
title: "Pod Failed/Unknown"
|
||||
condition: B
|
||||
condition: C
|
||||
for: 3m
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
@@ -168,11 +189,18 @@ data:
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
type: reduce
|
||||
refId: B
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: ""
|
||||
- refId: C
|
||||
relativeTimeRange: {from: 0, to: 0}
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
type: threshold
|
||||
refId: C
|
||||
expression: B
|
||||
conditions:
|
||||
- type: query
|
||||
evaluator: {params: [0], type: gt}
|
||||
operator: {type: and}
|
||||
query: {params: [A]}
|
||||
reducer: {params: [], type: last}
|
||||
- evaluator: {params: [0], type: gt}
|
||||
|
||||
Reference in New Issue
Block a user