Alerts


/etc/config/alerting_rules.yml > alert.rules
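The header above names the rule file and the rule group inside it (alert.rules). A minimal sketch of how such a file is usually wired into Prometheus — the mount path is taken from the header; the prometheus.yml fragment itself is an assumption, not part of this dump:

# prometheus.yml (sketch)
rule_files:
  - /etc/config/alerting_rules.yml

# alerting_rules.yml (sketch)
groups:
  - name: alert.rules
    rules:
      # ... the alert definitions listed below ...
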
fabricUsedDisk (2 active)
alert: fabricUsedDisk
expr: kubelet_volume_stats_used_bytes{persistentvolumeclaim=~"production-.*|database-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"production-.*|database-.*"} * 100 > 60
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: fabricUsedDisk greater than 60% (instance {{ $labels.instance }})
Labels: alertname="fabricUsedDisk" alpha_eksctl_io_cluster_name="pre3" alpha_eksctl_io_instance_id="i-022c2d5862c5caa3c" alpha_eksctl_io_nodegroup_name="ng-7-workers-zone-1a-24" beta_kubernetes_io_arch="amd64" beta_kubernetes_io_instance_type="t3a.medium" beta_kubernetes_io_os="linux" failure_domain_beta_kubernetes_io_region="eu-west-1" failure_domain_beta_kubernetes_io_zone="eu-west-1a" instance="ip-192-168-38-223.eu-west-1.compute.internal" job="kubernetes-nodes" k8s_io_cloud_provider_aws="5c95b4e26f974af0109535d635c2eeb9" kubernetes_io_arch="amd64" kubernetes_io_hostname="ip-192-168-38-223.eu-west-1.compute.internal" kubernetes_io_os="linux" namespace="pre" node_kubernetes_io_instance_type="t3a.medium" node_lifecycle="on-demand" persistentvolumeclaim="production-peer1-0" role="workers-zone-1a" severity="medium" topology_ebs_csi_aws_com_zone="eu-west-1a" topology_kubernetes_io_region="eu-west-1" topology_kubernetes_io_zone="eu-west-1a"
State: firing
Active Since: 2024-09-25 15:31:52.114227227 +0000 UTC
Value: 71.31900544517393

Labels: alertname="fabricUsedDisk" alpha_eksctl_io_cluster_name="pre3" alpha_eksctl_io_instance_id="i-05a8c64819f48f4e9" alpha_eksctl_io_nodegroup_name="ng-7-workers-zone-1c-24" beta_kubernetes_io_arch="amd64" beta_kubernetes_io_instance_type="t3a.medium" beta_kubernetes_io_os="linux" failure_domain_beta_kubernetes_io_region="eu-west-1" failure_domain_beta_kubernetes_io_zone="eu-west-1c" instance="ip-192-168-87-210.eu-west-1.compute.internal" job="kubernetes-nodes" k8s_io_cloud_provider_aws="5c95b4e26f974af0109535d635c2eeb9" kubernetes_io_arch="amd64" kubernetes_io_hostname="ip-192-168-87-210.eu-west-1.compute.internal" kubernetes_io_os="linux" namespace="pre" node_kubernetes_io_instance_type="t3a.medium" node_lifecycle="on-demand" persistentvolumeclaim="production-peer2-0" role="workers-zone-1c" severity="medium" topology_ebs_csi_aws_com_zone="eu-west-1c" topology_kubernetes_io_region="eu-west-1" topology_kubernetes_io_zone="eu-west-1c"
State: firing
Active Since: 2024-09-18 11:31:52.114227227 +0000 UTC
Value: 71.31966046429065
HostDiskWillFillIn48Hours (0 active)
alert: HostDiskWillFillIn48Hours
expr: predict_linear(node_filesystem_avail_bytes[1h], 48 * 3600) < 0
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk will fill in 48 hours at current write rate
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host disk will fill in 48 hours (instance {{ $labels.instance }})
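predict_linear fits a least-squares line through the last hour of node_filesystem_avail_bytes samples and extrapolates it 48 * 3600 seconds into the future; the alert fires when the extrapolated free space goes negative. The same extrapolation can be checked ad hoc for a single filesystem (the instance and mountpoint values here are placeholders):

predict_linear(node_filesystem_avail_bytes{instance="<node>", mountpoint="/"}[1h], 48 * 3600)

A negative result means the mount is predicted to be full within 48 hours at the current write rate.
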
HostHighCpuLoad (0 active)
alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 60
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 60%
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
HostMemoryUnderMemoryPressure (0 active)
alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[2m]) > 8
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure. High rate of major page faults
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
HostOutOfDiskInodes (0 active)
alert: HostOutOfDiskInodes
expr: (node_filesystem_files_free / node_filesystem_files) * 100 < 50
labels:
  severity: warning
annotations:
  description: |-
    Disk inodes are almost exhausted (< 50% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of disk inodes (instance {{ $labels.instance }})
HostOutOfDiskSpace (0 active)
alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 30
labels:
  severity: warning
annotations:
  description: |-
    Disk is almost full (< 30% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
HostOutOfMemory (0 active)
alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 30
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 30% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
HostOutOfMemoryCritical (0 active)
alert: HostOutOfMemoryCritical
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
labels:
  severity: critical
annotations:
  description: |-
    Node memory is filling up (< 10% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
HostUnusualNetworkThroughputIn (0 active)
alert: HostUnusualNetworkThroughputIn
expr: (sum by(instance) (irate(node_network_receive_bytes_total[2m]))) / (1024 * 1024) > 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably receiving too much data (> 1 MB/s)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
HostUnusualNetworkThroughputOut (0 active)
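The rule body for HostUnusualNetworkThroughputOut is not included in the dump above. Assuming it mirrors the inbound rule, it would look roughly like this (the transmit counter, threshold and wording are assumptions, not taken from the file):

alert: HostUnusualNetworkThroughputOut
expr: (sum by(instance) (irate(node_network_transmit_bytes_total[2m]))) / (1024 * 1024) > 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably sending too much data (> 1 MB/s)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
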
InstanceDown (0 active)
alert: InstanceDown
expr: up == 0
for: 10s
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 seconds.'
  summary: Endpoint {{ $labels.instance }} down
PrometheusConfigurationReloadFailure (0 active)
alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus configuration reload error
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
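prometheus_config_last_reload_successful is exposed by Prometheus itself and drops to 0 when a configuration reload fails, so the rule fires once the flag has been 0 for one minute. A companion ad-hoc query shows how long ago the last successful reload happened (this timestamp metric is also built into Prometheus):

time() - prometheus_config_last_reload_success_timestamp_seconds
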
PrometheusTooManyRestarts (0 active)
alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Prometheus too many restarts (instance {{ $labels.instance }})
RaftLeaderChanges (0 active)
alert: RaftLeaderChanges
expr: changes(consensus_etcdraft_is_leader[2h]) > 0
labels:
  severity: warning
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: Raft leader changed (instance {{ $labels.instance }})
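consensus_etcdraft_is_leader is a Hyperledger Fabric orderer gauge that is 1 on the current Raft leader and 0 on followers, so any change over the two-hour window indicates a leader election. To see which orderer currently holds leadership, a simple ad-hoc query:

consensus_etcdraft_is_leader == 1

The channel label on the result (present on Fabric's Raft metrics) identifies the channel whose leadership is being reported.
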
chainCodeMemoryUsage (0 active)
alert: chainCodeMemoryUsage
expr: container_memory_working_set_bytes{name=~"pr.-peer..pr..gouze.io-.*"} / (1024 * 1024) > 25
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chainCodeMemoryUsage greater than 25 MB (instance {{ $labels.instance }})
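container_memory_working_set_bytes is reported in bytes, so dividing by 1024 * 1024 converts it to mebibytes before comparing against the 25 MB threshold; the couchdb, orderer and peer memory rules below use the same conversion. A worked example of the arithmetic:

# 30 MiB working set, in bytes:
31457280 / (1024 * 1024) = 30   -> above the 25 MB threshold, so the alert fires
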
chaincodeExecuteTimeouts (0 active)
alert: chaincodeExecuteTimeouts
expr: rate(chaincode_execute_timeouts[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chaincodeExecuteTimeouts per minute (instance {{ $labels.instance }})
chaincodeLaunchFailures (0 active)
alert: chaincodeLaunchFailures
expr: rate(chaincode_launch_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chaincodeLaunchFailures per minute (instance {{ $labels.instance }})
chaincodeLaunchTimeouts (0 active)
alert: chaincodeLaunchTimeouts
expr: rate(chaincode_launch_timeouts[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chaincodeLaunchTimeouts per minute (instance {{ $labels.instance }})
containerStatusLastTerminatedReason (0 active)
alert: containerStatusLastTerminatedReason
expr: increase(kube_pod_container_status_last_terminated_reason[1d]) > 0
labels:
  severity: warning
annotations:
  description: |-
    Container terminated in the last 24 hours
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Container {{ $labels.instance }} down
couchDbMemoryUsage (0 active)
alert: couchDbMemoryUsage
expr: container_memory_working_set_bytes{container="couchdb",pod=~"peer.*"} / (1024 * 1024) > 1200
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: couchDbMemoryUsage greater than 1200 MB (instance {{ $labels.instance }})
endorserChaincodeInstantiationFailures (0 active)
alert: endorserChaincodeInstantiationFailures
expr: rate(endorser_chaincode_instantiation_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserChaincodeInstantiationFailures (instance {{ $labels.instance }})
endorserDuplicateTransactionFailures (0 active)
alert: endorserDuplicateTransactionFailures
expr: rate(endorser_duplicate_transaction_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserDuplicateTransactionFailures (instance {{ $labels.instance }})
endorserEndorsementFailures (0 active)
alert: endorserEndorsementFailures
expr: rate(endorser_endorsement_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorser_endorsement_failures (instance {{ $labels.instance }})
endorserProposalAclFailures (0 active)
alert: endorserProposalAclFailures
expr: rate(endorser_proposal_acl_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserProposalAclFailures (instance {{ $labels.instance }})
endorserProposalValidationFailures (0 active)
alert: endorserProposalValidationFailures
expr: rate(endorser_proposal_validation_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserProposalValidationFailures (instance {{ $labels.instance }})
ledgerTransactionCountPerMinuteCritical (0 active)
alert: ledgerTransactionCountPerMinuteCritical
expr: rate(ledger_transaction_count[1m]) * 60 > 60
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: ledger_transaction_count more than 60 per minute (instance {{ $labels.instance }})
ledgerTransactionCountPerMinuteWarning (0 active)
alert: ledgerTransactionCountPerMinuteWarning
expr: rate(ledger_transaction_count[1m]) * 60 > 16
labels:
  severity: warning
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: ledger_transaction_count more than 16 per minute (instance {{ $labels.instance }})
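rate(ledger_transaction_count[1m]) averages the transaction counter over the last minute and yields transactions per second; multiplying by 60 converts it to transactions per minute, which is what the 16 (warning) and 60 (critical) thresholds refer to. A worked example:

# rate(...) = 0.5 tx/s  ->  0.5 * 60 = 30 tx/min: fires the warning rule (> 16) but not the critical one (> 60)
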
ordererMemoryUsage (0 active)
alert: ordererMemoryUsage
expr: container_memory_working_set_bytes{container=~"orderer."} / (1024 * 1024) > 100
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: ordererMemoryUsage greater than 100 MB (instance {{ $labels.instance }})
peerMemoryUsage (0 active)
alert: peerMemoryUsage
expr: container_memory_working_set_bytes{container="peer",pod=~"peer.*"} / (1024 * 1024) > 600
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: peerMemoryUsage greater than 600 MB (instance {{ $labels.instance }})