Alerts


/etc/alerts.d/node_alerting_rules.yml > container_cpu_usage_is_high
POD_CPU_IS_HIGH (1 active)
# Fire when any container sustains more than 90% of one CPU core for 1m.
alert: POD_CPU_IS_HIGH
# rate() over 5m of the cumulative CPU counter, grouped per container/pod/namespace.
# container!="" excludes the pod-level cgroup aggregate series (empty container label).
expr: sum by(container, pod, namespace) (rate(container_cpu_usage_seconds_total{container!=""}[5m])) * 100 > 90
for: 1m
labels:
  severity: critical
annotations:
  # NOTE: the value is a percentage of ONE core, so multi-core containers can
  # exceed 100 (the active firing sample above shows ~415).
  # Template spacing normalized to "{{ $labels.x }}" throughout — rendered
  # output is identical; this only makes the annotations consistent.
  description: Container {{ $labels.container }} CPU usage inside POD {{ $labels.pod }} is high in {{ $labels.namespace }}
  summary: POD {{ $labels.pod }} CPU Usage is high in {{ $labels.namespace }}
Labels State Active Since Value
alertname="POD_CPU_IS_HIGH" container="alpha" namespace="ssd-june" pod="dgraph-0" severity="critical" firing 2025-12-06 11:01:39.87619941 +0000 UTC 415.44198955904835
/etc/alerts.d/node_alerting_rules.yml > container_memory_usage_is_high
POD_MEMORY_USAGE_IS_HIGH (2 active)
# Fire when a container's working-set memory exceeds 80% of its configured
# memory limit for 1m.
alert: POD_MEMORY_USAGE_IS_HIGH
# The "(container_spec_memory_limit_bytes > 0)" filter drops containers with no
# limit configured (limit reported as 0), which also prevents division by zero.
# Containers without a limit are therefore never matched by this alert.
expr: (sum by(container, pod, namespace) (container_memory_working_set_bytes{container!=""}) / sum by(container, pod, namespace) (container_spec_memory_limit_bytes > 0) * 100) > 80
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Container Memory usage is above 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # Template spacing normalized ("{{ $labels.pod }}"); rendered output unchanged.
  summary: Container {{ $labels.container }} Memory usage inside POD {{ $labels.pod }} is high in {{ $labels.namespace }}
Labels State Active Since Value
alertname="POD_MEMORY_USAGE_IS_HIGH" container="minio" namespace="redica-sep" pod="ssd-minio-6d9ddbcc9b-lnc6d" severity="critical" firing 2025-12-01 18:14:29.308708883 +0000 UTC 90.24861653645834
alertname="POD_MEMORY_USAGE_IS_HIGH" container="minio" namespace="ssd-june" pod="ssd-minio-6d9ddbcc9b-46lw8" severity="critical" firing 2025-11-02 11:11:29.308708883 +0000 UTC 96.99605305989584
/etc/alerts.d/node_alerting_rules.yml > node_cpu_greater_than_80
NODE_CPU_IS_HIGH (0 active)
# Fire when a node's overall CPU utilisation stays above 90% for 1m.
alert: NODE_CPU_IS_HIGH
# 100 minus the averaged idle percentage per instance = busy percentage.
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
for: 1m
labels:
  severity: critical
annotations:
  # avg by(instance) keeps ONLY the "instance" label on the result, so the
  # original "{{ $labels.kubernetes_node }}" rendered empty here.
  description: node {{ $labels.instance }} cpu is high
  # Text now matches the expr threshold (90, not 80); "precent" typo fixed.
  summary: node cpu is greater than 90 percent
/etc/alerts.d/node_alerting_rules.yml > node_disk_space_too_low
NODE_DISK_SPACE_IS_LOW (0 active)
# Fire when the root filesystem has less than 10% of its space available for 1m.
alert: NODE_DISK_SPACE_IS_LOW
# avail/size * 100 = percent free, restricted to the "/" mountpoint.
expr: (100 * ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"}) / (node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}))) < 10
for: 1m
labels:
  severity: critical
annotations:
  # NOTE(review): node_filesystem_* series normally carry "instance", not
  # "node" — the original "{{ $labels.node }}" would render empty unless a
  # relabel config adds "node"; confirm against the scrape config.
  description: node {{ $labels.instance }} disk space is only {{ printf "%0.2f" $value }}% free.
  summary: node disk space remaining is less than 10 percent
/etc/alerts.d/node_alerting_rules.yml > node_down
NODE_DOWN (0 active)
# Fire when Prometheus has failed to scrape a kubernetes-nodes target for 3m.
alert: NODE_DOWN
# up == 0 means the most recent scrape of the target failed.
expr: up{component="kubernetes-nodes"} == 0
# Longer hold than the other rules (3m vs 1m) to ride out transient scrape blips.
for: 3m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }} job failed to scrape instance {{ $labels.instance }} for more than 3 minutes. Node Seems to be down'
  # NOTE(review): assumes a relabel config attaches a "kubernetes_node" label
  # to the `up` series for this job — confirm, otherwise this renders empty.
  summary: Node {{ $labels.kubernetes_node }} is down
/etc/alerts.d/node_alerting_rules.yml > node_memory_left_lessser_than_10
NODE_MEMORY_LESS_THAN_10% (0 active)
# Fire when a node has less than 10% of its total memory available for 1m.
# Renamed from NODE_MEMORY_LESS_THAN_10%: "%" is not a legal character in a
# Prometheus alert/rule name (names must match the metric-name charset
# [a-zA-Z_:][a-zA-Z0-9_:]*), so the original name fails rule validation.
# Update any Alertmanager routes/silences matching the old alertname.
alert: NODE_MEMORY_LESS_THAN_10_PERCENT
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 1m
labels:
  severity: critical
annotations:
  # NOTE(review): node_memory_* series normally expose "instance";
  # "{{ $labels.kubernetes_node }}" only renders if a relabel adds it —
  # confirm against the scrape config.
  description: node {{ $labels.kubernetes_node }} memory left is low
  # "lesser"/"precent" typos fixed.
  summary: node memory left is less than 10 percent
/etc/alerts.d/node_alerting_rules.yml > prometheus-job-down
prometheus-job-down (0 active)
# Fire when the Prometheus self-scrape target reports down for 1m.
# NOTE(review): this is self-monitoring — if the Prometheus server itself
# dies it cannot evaluate or deliver this alert. An external watchdog
# (dead man's switch) is needed for real coverage of that failure mode.
alert: prometheus-job-down
expr: up{job="prometheus"} == 0
for: 1m
labels:
  severity: warning
annotations:
  description: Default Prometheus Job is Down LABELS = {{ $labels }}
  # Template spacing normalized ("{{ $labels.job }}"); rendered output unchanged.
  summary: The Default Prometheus Job is Down (job {{ $labels.job }})