Alerts


/etc/config/alerting_rules.yml > alert.rules
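The header above names the rule file and the rule group inside it (alert.rules). A minimal sketch of how such a file is usually wired into Prometheus — the mount path is taken from the header; the prometheus.yml fragment itself is an assumption, not part of this dump:

# prometheus.yml (sketch)
rule_files:
  - /etc/config/alerting_rules.yml

# alerting_rules.yml (sketch)
groups:
  - name: alert.rules
    rules:
      # ... the alert definitions listed below ...
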
fabricUsedDisk (2 active)
alert: fabricUsedDisk
expr: kubelet_volume_stats_used_bytes{persistentvolumeclaim=~"production-.*|database-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"production-.*|database-.*"} * 100 > 60
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: fabricUsedDisk greater than 60% (instance {{ $labels.instance }})
Labels: alertname="fabricUsedDisk" alpha_eksctl_io_cluster_name="pre3" alpha_eksctl_io_instance_id="i-022c2d5862c5caa3c" alpha_eksctl_io_nodegroup_name="ng-7-workers-zone-1a-24" beta_kubernetes_io_arch="amd64" beta_kubernetes_io_instance_type="t3a.medium" beta_kubernetes_io_os="linux" failure_domain_beta_kubernetes_io_region="eu-west-1" failure_domain_beta_kubernetes_io_zone="eu-west-1a" instance="ip-192-168-38-223.eu-west-1.compute.internal" job="kubernetes-nodes" k8s_io_cloud_provider_aws="5c95b4e26f974af0109535d635c2eeb9" kubernetes_io_arch="amd64" kubernetes_io_hostname="ip-192-168-38-223.eu-west-1.compute.internal" kubernetes_io_os="linux" namespace="pre" node_kubernetes_io_instance_type="t3a.medium" node_lifecycle="on-demand" persistentvolumeclaim="production-peer1-0" role="workers-zone-1a" severity="medium" topology_ebs_csi_aws_com_zone="eu-west-1a" topology_kubernetes_io_region="eu-west-1" topology_kubernetes_io_zone="eu-west-1a"
State: firing
Active Since: 2024-09-25 15:31:52.114227227 +0000 UTC
Value: 71.31900544517393

Labels: alertname="fabricUsedDisk" alpha_eksctl_io_cluster_name="pre3" alpha_eksctl_io_instance_id="i-05a8c64819f48f4e9" alpha_eksctl_io_nodegroup_name="ng-7-workers-zone-1c-24" beta_kubernetes_io_arch="amd64" beta_kubernetes_io_instance_type="t3a.medium" beta_kubernetes_io_os="linux" failure_domain_beta_kubernetes_io_region="eu-west-1" failure_domain_beta_kubernetes_io_zone="eu-west-1c" instance="ip-192-168-87-210.eu-west-1.compute.internal" job="kubernetes-nodes" k8s_io_cloud_provider_aws="5c95b4e26f974af0109535d635c2eeb9" kubernetes_io_arch="amd64" kubernetes_io_hostname="ip-192-168-87-210.eu-west-1.compute.internal" kubernetes_io_os="linux" namespace="pre" node_kubernetes_io_instance_type="t3a.medium" node_lifecycle="on-demand" persistentvolumeclaim="production-peer2-0" role="workers-zone-1c" severity="medium" topology_ebs_csi_aws_com_zone="eu-west-1c" topology_kubernetes_io_region="eu-west-1" topology_kubernetes_io_zone="eu-west-1c"
State: firing
Active Since: 2024-09-18 11:31:52.114227227 +0000 UTC
Value: 71.31966046429065
HostDiskWillFillIn48Hours (0 active)
alert: HostDiskWillFillIn48Hours
expr: predict_linear(node_filesystem_avail_bytes[1h], 48 * 3600) < 0
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk will fill in 48 hours at current write rate
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host disk will fill in 48 hours (instance {{ $labels.instance }})
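predict_linear fits a least-squares line through the last hour of node_filesystem_avail_bytes samples and extrapolates it 48 * 3600 seconds into the future; the alert fires when the extrapolated free space goes negative. The same extrapolation can be checked ad hoc for a single filesystem (the instance and mountpoint values here are placeholders):

predict_linear(node_filesystem_avail_bytes{instance="<node>", mountpoint="/"}[1h], 48 * 3600)

A negative result means the mount is predicted to be full within 48 hours at the current write rate.
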
HostHighCpuLoad (0 active)
alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 60
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 60%
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
HostMemoryUnderMemoryPressure (0 active)
alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[2m]) > 8
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure. High rate of major page faults
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
HostOutOfDiskInodes (0 active)
alert: HostOutOfDiskInodes
expr: (node_filesystem_files_free / node_filesystem_files) * 100 < 50
labels:
  severity: warning
annotations:
  description: |-
    Disk inodes are almost exhausted (< 50% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of disk inodes (instance {{ $labels.instance }})
HostOutOfDiskSpace (0 active)
alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 30
labels:
  severity: warning
annotations:
  description: |-
    Disk is almost full (< 30% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
HostOutOfMemory (0 active)
alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 30
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 30% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
HostOutOfMemoryCritical (0 active)
alert: HostOutOfMemoryCritical
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
labels:
  severity: critical
annotations:
  description: |-
    Node memory is filling up (< 10% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
HostUnusualNetworkThroughputIn (0 active)
alert: HostUnusualNetworkThroughputIn
expr: (sum by(instance) (irate(node_network_receive_bytes_total[2m]))) / (1024 * 1024) > 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably receiving too much data (> 1 MB/s)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
HostUnusualNetworkThroughputOut (0 active)
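The rule body for HostUnusualNetworkThroughputOut is not included in the dump above. Assuming it mirrors the inbound rule, it would look roughly like this (the transmit counter, threshold and wording are assumptions, not taken from the file):

alert: HostUnusualNetworkThroughputOut
expr: (sum by(instance) (irate(node_network_transmit_bytes_total[2m]))) / (1024 * 1024) > 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably sending too much data (> 1 MB/s)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
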
InstanceDown (0 active)
alert: InstanceDown
expr: up == 0
for: 10s
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 seconds.'
  summary: Endpoint {{ $labels.instance }} down
PrometheusConfigurationReloadFailure (0 active)
alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus configuration reload error
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
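prometheus_config_last_reload_successful is exposed by Prometheus itself and drops to 0 when a configuration reload fails, so the rule fires once the flag has been 0 for one minute. A companion ad-hoc query shows how long ago the last successful reload happened (this timestamp metric is also built into Prometheus):

time() - prometheus_config_last_reload_success_timestamp_seconds
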
PrometheusTooManyRestarts (0 active)
alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Prometheus too many restarts (instance {{ $labels.instance }})
RaftLeaderChanges (0 active)
alert: RaftLeaderChanges
expr: changes(consensus_etcdraft_is_leader[2h]) > 0
labels:
  severity: warning
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: Raft leader changed (instance {{ $labels.instance }})
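consensus_etcdraft_is_leader is a Hyperledger Fabric orderer gauge that is 1 on the current Raft leader and 0 on followers, so any change over the two-hour window indicates a leader election. To see which orderer currently holds leadership, a simple ad-hoc query:

consensus_etcdraft_is_leader == 1

The channel label on the result (present on Fabric's Raft metrics) identifies the channel whose leadership is being reported.
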
chainCodeMemoryUsage (0 active)
alert: chainCodeMemoryUsage
expr: container_memory_working_set_bytes{name=~"pr.-peer..pr..gouze.io-.*"} / (1024 * 1024) > 25
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chainCodeMemoryUsage greater than 25 MB (instance {{ $labels.instance }})
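container_memory_working_set_bytes is reported in bytes, so dividing by 1024 * 1024 converts it to mebibytes before comparing against the 25 MB threshold; the couchdb, orderer and peer memory rules below use the same conversion. A worked example of the arithmetic:

# 30 MiB working set, in bytes:
31457280 / (1024 * 1024) = 30   -> above the 25 MB threshold, so the alert fires
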
chaincodeExecuteTimeouts (0 active)
alert: chaincodeExecuteTimeouts
expr: rate(chaincode_execute_timeouts[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chaincodeExecuteTimeouts per minute (instance {{ $labels.instance }})
chaincodeLaunchFailures (0 active)
alert: chaincodeLaunchFailures
expr: rate(chaincode_launch_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chaincodeLaunchFailures per minute (instance {{ $labels.instance }})
chaincodeLaunchTimeouts (0 active)
alert: chaincodeLaunchTimeouts
expr: rate(chaincode_launch_timeouts[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: chaincodeLaunchTimeouts per minute (instance {{ $labels.instance }})
containerStatusLastTerminatedReason (0 active)
alert: containerStatusLastTerminatedReason
expr: increase(kube_pod_container_status_last_terminated_reason[1d]) > 0
labels:
  severity: warning
annotations:
  description: |-
    Container terminated in the last 24 hours
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  summary: Container {{ $labels.instance }} down
couchDbMemoryUsage (0 active)
alert: couchDbMemoryUsage
expr: container_memory_working_set_bytes{container="couchdb",pod=~"peer.*"} / (1024 * 1024) > 1200
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: couchDbMemoryUsage greater than 1200 MB (instance {{ $labels.instance }})
endorserChaincodeInstantiationFailures (0 active)
alert: endorserChaincodeInstantiationFailures
expr: rate(endorser_chaincode_instantiation_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserChaincodeInstantiationFailures (instance {{ $labels.instance }})
endorserDuplicateTransactionFailures (0 active)
alert: endorserDuplicateTransactionFailures
expr: rate(endorser_duplicate_transaction_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserDuplicateTransactionFailures (instance {{ $labels.instance }})
endorserEndorsementFailures (0 active)
alert: endorserEndorsementFailures
expr: rate(endorser_endorsement_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorser_endorsement_failures (instance {{ $labels.instance }})
endorserProposalAclFailures (0 active)
alert: endorserProposalAclFailures
expr: rate(endorser_proposal_acl_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserProposalAclFailures (instance {{ $labels.instance }})
endorserProposalValidationFailures (0 active)
alert: endorserProposalValidationFailures
expr: rate(endorser_proposal_validation_failures[30s]) * 30 > 0
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: endorserProposalValidationFailures (instance {{ $labels.instance }})
ledgerTransactionCountPerMinuteCritical (0 active)
alert: ledgerTransactionCountPerMinuteCritical
expr: rate(ledger_transaction_count[1m]) * 60 > 60
labels:
  severity: critical
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: ledger_transaction_count more than 60 per minute (instance {{ $labels.instance }})
ledgerTransactionCountPerMinuteWarning (0 active)
alert: ledgerTransactionCountPerMinuteWarning
expr: rate(ledger_transaction_count[1m]) * 60 > 16
labels:
  severity: warning
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: ledger_transaction_count more than 16 per minute (instance {{ $labels.instance }})
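rate(ledger_transaction_count[1m]) averages the transaction counter over the last minute and yields transactions per second; multiplying by 60 converts it to transactions per minute, which is what the 16 (warning) and 60 (critical) thresholds refer to. A worked example:

# rate(...) = 0.5 tx/s  ->  0.5 * 60 = 30 tx/min: fires the warning rule (> 16) but not the critical one (> 60)
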
ordererMemoryUsage (0 active)
alert: ordererMemoryUsage
expr: container_memory_working_set_bytes{container=~"orderer."} / (1024 * 1024) > 100
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: ordererMemoryUsage greater than 100 MB (instance {{ $labels.instance }})
peerMemoryUsage (0 active)
alert: peerMemoryUsage
expr: container_memory_working_set_bytes{container="peer",pod=~"peer.*"} / (1024 * 1024) > 600
labels:
  severity: medium
annotations:
  description: |-
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: peerMemoryUsage greater than 600 MB (instance {{ $labels.instance }})