Alerts

/etc/prometheus/rules/ansible_managed.rules > ansible managed alert rules

InstanceDown (1 active)

# InstanceDown: fires for every series whose `up` value is 0, once that
# condition has held continuously for 5 minutes.
alert: InstanceDown
expr: 'up == 0'
for: 5m
labels:
  severity: critical
annotations:
  # Both annotations are templated from the firing series' `instance` and
  # `job` labels.
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 5 minutes.
  summary: 'Instance {{ $labels.instance }} down'

Labels	State	Active Since	Value
alertname="InstanceDown" instance="localhost:8100" job="demo" severity="critical"	firing	2022-09-12 13:25:44.318691705 +0000 UTC	0
Annotations
description localhost:8100 of job demo has been down for more than 5 minutes. summary Instance localhost:8100 down

Watchdog (1 active)

# Watchdog: `vector(1)` always yields a sample with value 1, so this alert is
# permanently active; it moves from pending to firing after the 10-minute
# `for` window.
alert: Watchdog
expr: 'vector(1)'
for: 10m
labels:
  severity: warning
annotations:
  description: >-
    This is an alert meant to ensure that the entire alerting pipeline
    is functional. This alert is always firing, therefore it should always be
    firing in Alertmanager and always fire against a receiver. There are
    integrations with various notification mechanisms that send a notification
    when this alert is not firing. For example the "DeadMansSnitch" integration
    in PagerDuty.
  summary: 'Ensure entire alerting pipeline is functional'

Labels	State	Active Since	Value
alertname="Watchdog" severity="warning"	firing	2022-09-12 13:25:24.318691705 +0000 UTC	1
Annotations
description This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. summary Ensure entire alerting pipeline is functional

ClockSkewDetected (0 active)

# ClockSkewDetected: fires when the absolute value of
# node_timex_offset_seconds, scaled by 1000, stays above 30 for 2 minutes
# (the metric name indicates seconds, so the threshold is 30 ms).
alert: ClockSkewDetected
expr: 'abs(node_timex_offset_seconds) * 1000 > 30'
for: 2m
labels:
  severity: warning
annotations:
  description: >-
    Clock skew detected on {{ $labels.instance }}. Ensure NTP is configured
    correctly on this host.
  summary: 'Instance {{ $labels.instance }} - Clock skew detected'

CriticalCPULoad (0 active)

# CriticalCPULoad: fires when the non-idle CPU percentage of an instance
# (100 minus the averaged idle-mode rate, as a percentage) stays above 96
# for 2 minutes.
alert: CriticalCPULoad
# `rate` is used instead of `irate`: irate only considers the last two samples
# in the range, so a brief dip can push the value under the threshold and
# reset the `for` timer. `job` is kept in the aggregation so the `job` label
# referenced by the description template is present on the alert instead of
# rendering as an empty string; the selector already pins job="node", so the
# grouping itself is unchanged.
expr: >-
  100
  - (avg by(instance, job) (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))
  * 100) > 96
for: 2m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load
    for more than 2 minutes.
  summary: 'Instance {{ $labels.instance }} - Critical CPU load'

CriticalDiskSpace (0 active)

# CriticalDiskSpace: fires when the ratio of free bytes to total size of a
# filesystem stays below 0.1 (10%) for 4 minutes. Filesystems whose fstype is
# squashfs or starts with "fuse", and mountpoints at or under /run, are
# excluded by the numerator's selector.
# NOTE(review): node_filesystem_free_bytes may include space reserved for
# root; node_filesystem_avail_bytes could match "space remaining" more
# closely - confirm before changing.
alert: CriticalDiskSpace
expr: >-
  node_filesystem_free_bytes{fstype!~"(squashfs|fuse.*)",job="node",mountpoint!~"^/run(/.*|$)"}
  / node_filesystem_size_bytes{job="node"} < 0.1
for: 4m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has less than 10%
    space remaining.
  summary: 'Instance {{ $labels.instance }} - Critical disk space usage'

CriticalRAMUsage (0 active)

# CriticalRAMUsage: fires when used memory, computed as
# 1 - (MemFree + Buffers + Cached) / MemTotal and expressed as a percentage,
# stays above 98 for 5 minutes.
alert: CriticalRAMUsage
expr: >-
  (1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
  / node_memory_MemTotal_bytes)) * 100 > 98
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} has Critical Memory Usage more than 5 minutes.
  summary: 'Instance {{ $labels.instance }} has Critical Memory Usage'

RebootRequired (0 active)

# RebootRequired: fires whenever node_reboot_required is greater than 0.
# There is no `for` clause, so no hold period applies before firing.
alert: RebootRequired
expr: 'node_reboot_required > 0'
labels:
  severity: warning
annotations:
  description: "{{ $labels.instance }} requires a reboot."
  summary: 'Instance {{ $labels.instance }} - reboot required'