add alerts

This commit is contained in:
Kopatz
2024-04-14 15:54:02 +02:00
parent b7bd502525
commit 485d452dd7
5 changed files with 345 additions and 100 deletions

View File

@@ -1,97 +0,0 @@
---
# Grafana alert-rule provisioning file.
# Declares one rule group ("default", folder "ALARM") evaluated every 5m.
apiVersion: 1
groups:
  - orgId: 1
    name: default
    folder: ALARM
    interval: 5m
    rules:
      # Systemd Units State
      # Pipeline: D (Prometheus query) -> A (reduce: last) -> B (threshold).
      # The rule's condition is B, which must hold for 5m.
      - uid: ddin0kv0wnj0gd
        title: Systemd Units State
        condition: B
        data:
          # D: systemd units reported in state "failed" by the scrape target.
          - refId: D
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: 'node_systemd_units{instance="127.0.0.1:9001",job="scrapema",state="failed"}'
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: Failed
              maxDataPoints: 43200
              refId: D
              step: 240

          # A: reduces D to its last value (settings.mode: dropNN).
          # NOTE(review): the `conditions` list looks like exporter boilerplate
          # for a reduce expression - confirm before relying on it.
          - refId: A
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: D
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: A
              settings:
                mode: dropNN
              type: reduce

          # B: threshold on A using evaluator "gt" with params [0, 0].
          - refId: B
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: A
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        # Dashboard panel this rule is linked to.
        dashboardUid: rYdddlPWk
        panelId: 298
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          __dashboardUid__: rYdddlPWk
          __panelId__: "298"
        labels: {}
        isPaused: false

View File

@@ -19,8 +19,8 @@ in
}; };
provision.alerting.contactPoints.path = config.age.secrets.grafana-contact-points.path; provision.alerting.contactPoints.path = config.age.secrets.grafana-contact-points.path;
provision.alerting.policies.path = ./grafana-dashboards/notification-policies.yml; provision.alerting.policies.path = ./grafana/notification-policies.yml;
provision.alerting.templates.path = ./grafana-dashboards/alerts.yml; provision.alerting.templates.path = ./grafana/alerts.yml;
provision.datasources.settings = { provision.datasources.settings = {
datasources = datasources =
[ [
@@ -34,7 +34,7 @@ in
}; };
provision.dashboards.settings.providers = [{ provision.dashboards.settings.providers = [{
name = "provisioned-dashboards"; name = "provisioned-dashboards";
options.path = ./grafana-dashboards; options.path = ./grafana/dashboards;
}]; }];
}; };

View File

@@ -0,0 +1,342 @@
---
# Grafana alert-rule provisioning file.
# Declares one rule group ("default", folder "ALARM") evaluated every 5m,
# containing three rules. Every rule follows the same pipeline shape:
#   Prometheus query -> reduce (last value) -> threshold (the rule condition).
apiVersion: 1
groups:
  - orgId: 1
    name: default
    folder: ALARM
    interval: 5m
    rules:
      # ---------------------------------------------------------------
      # Systemd Units State
      # Pipeline: D (query) -> A (reduce: last) -> B (threshold gt, 0).
      # Condition B must hold for 5m.
      # ---------------------------------------------------------------
      - uid: ddin0kv0wnj0gd
        title: Systemd Units State
        condition: B
        data:
          # D: systemd units reported in state "failed" by the scrape target.
          - refId: D
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: 'node_systemd_units{instance="127.0.0.1:9001",job="scrapema",state="failed"}'
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: Failed
              maxDataPoints: 43200
              refId: D
              step: 240

          # A: reduces D to its last value (settings.mode: dropNN).
          # NOTE(review): the `conditions` list looks like exporter boilerplate
          # for a reduce expression - confirm before relying on it.
          - refId: A
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: D
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: A
              settings:
                mode: dropNN
              type: reduce

          # B: threshold on A using evaluator "gt" with params [0, 0].
          - refId: B
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: A
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        # Dashboard panel this rule is linked to.
        dashboardUid: rYdddlPWk
        panelId: 298
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          __dashboardUid__: rYdddlPWk
          __panelId__: "298"
        labels: {}
        isPaused: false

      # ---------------------------------------------------------------
      # Memory Basic
      # Pipeline: D (MemFree query) -> F (reduce: last) -> G (threshold lt).
      # Condition G must hold for 5m.
      # NOTE(review): queries A, B, C and E are not referenced by F or G;
      # they look like leftovers from the linked dashboard panel - confirm
      # whether they can be dropped from the rule.
      # NOTE(review): the condition is built on MemFree only, while query C
      # tracks cache/buffers separately; confirm MemFree (rather than e.g.
      # node_memory_MemAvailable_bytes) is the intended signal.
      # ---------------------------------------------------------------
      - uid: adin55cdu3ocga
        title: Memory Basic
        condition: G
        data:
          # A: total RAM.
          - refId: A
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: 'node_memory_MemTotal_bytes{instance="127.0.0.1:9001",job="scrapema"}'
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: RAM Total
              maxDataPoints: 43200
              refId: A
              step: 240

          # B: used RAM = total - free - (cached + buffers + reclaimable slab).
          # The folded scalar below joins its lines with single spaces.
          - refId: B
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: >-
                node_memory_MemTotal_bytes{instance="127.0.0.1:9001",job="scrapema"}
                - node_memory_MemFree_bytes{instance="127.0.0.1:9001",job="scrapema"}
                - (node_memory_Cached_bytes{instance="127.0.0.1:9001",job="scrapema"}
                + node_memory_Buffers_bytes{instance="127.0.0.1:9001",job="scrapema"}
                + node_memory_SReclaimable_bytes{instance="127.0.0.1:9001",job="scrapema"})
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: RAM Used
              maxDataPoints: 43200
              refId: B
              step: 240

          # C: cached + buffers + reclaimable slab.
          - refId: C
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: >-
                node_memory_Cached_bytes{instance="127.0.0.1:9001",job="scrapema"}
                + node_memory_Buffers_bytes{instance="127.0.0.1:9001",job="scrapema"}
                + node_memory_SReclaimable_bytes{instance="127.0.0.1:9001",job="scrapema"}
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: RAM Cache + Buffer
              maxDataPoints: 43200
              refId: C
              step: 240

          # D: free RAM - the only query feeding the alert condition.
          - refId: D
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: 'node_memory_MemFree_bytes{instance="127.0.0.1:9001",job="scrapema"}'
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: RAM Free
              maxDataPoints: 43200
              refId: D
              step: 240

          # E: used swap = swap total - swap free.
          - refId: E
            relativeTimeRange:
              from: 86400
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              expr: >-
                (node_memory_SwapTotal_bytes{instance="127.0.0.1:9001",job="scrapema"}
                - node_memory_SwapFree_bytes{instance="127.0.0.1:9001",job="scrapema"})
              format: time_series
              interval: ""
              intervalFactor: 1
              intervalMs: 15000
              legendFormat: SWAP Used
              maxDataPoints: 43200
              refId: E
              step: 240

          # F: reduces D to its last value (settings.mode: dropNN).
          - refId: F
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: []
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [F]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: D
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: F
              settings:
                mode: dropNN
              type: reduce

          # G: threshold on F using evaluator "lt" with first param 1e9.
          # Written with a decimal point so that YAML 1.1 parsers also read
          # it as a float rather than a string.
          - refId: G
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1.0e+09, 0]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: F
              intervalMs: 1000
              maxDataPoints: 43200
              refId: G
              type: threshold
        # Dashboard panel this rule is linked to.
        dashboardUid: rYdddlPWk
        panelId: 78
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          __dashboardUid__: rYdddlPWk
          __panelId__: "78"
        labels: {}
        isPaused: false

      # ---------------------------------------------------------------
      # Container Not Seen
      # Pipeline: A (query) -> B (reduce: last) -> C (threshold gt, 60).
      # Condition C must hold for 5m.
      # The title replaces the exported placeholder "Panel Title" so that
      # notifications identify what fired; the uid is unchanged.
      # ---------------------------------------------------------------
      - uid: ddipptubkwe80f
        title: Container Not Seen
        condition: C
        data:
          # A: current time minus container_last_seen, per container name.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: PBFE396EC0B189D67
            model:
              datasource:
                type: prometheus
                uid: PBFE396EC0B189D67
              editorMode: code
              exemplar: false
              expr: 'time() - container_last_seen{name=~".+"}'
              format: time_series
              instant: false
              interval: ""
              intervalMs: 15000
              legendFormat: '{{name}}'
              maxDataPoints: 43200
              range: true
              refId: A

          # B: reduces A to its last value.
          - refId: B
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: []
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [B]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: B
              type: reduce

          # C: threshold on B using evaluator "gt" with first param 60.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [60, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: B
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        # Dashboard panel this rule is linked to.
        dashboardUid: cdijlo7pmmby8c
        panelId: 2
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          __dashboardUid__: cdijlo7pmmby8c
          __panelId__: "2"
        labels: {}
        isPaused: false