/etc/config/alerting_rules.yml > alert.rules
|
Labels |
State |
Active Since |
Value |
alertname="fabricUsedDisk"
alpha_eksctl_io_cluster_name="pre3"
alpha_eksctl_io_instance_id="i-022c2d5862c5caa3c"
alpha_eksctl_io_nodegroup_name="ng-7-workers-zone-1a-24"
beta_kubernetes_io_arch="amd64"
beta_kubernetes_io_instance_type="t3a.medium"
beta_kubernetes_io_os="linux"
failure_domain_beta_kubernetes_io_region="eu-west-1"
failure_domain_beta_kubernetes_io_zone="eu-west-1a"
instance="ip-192-168-38-223.eu-west-1.compute.internal"
job="kubernetes-nodes"
k8s_io_cloud_provider_aws="5c95b4e26f974af0109535d635c2eeb9"
kubernetes_io_arch="amd64"
kubernetes_io_hostname="ip-192-168-38-223.eu-west-1.compute.internal"
kubernetes_io_os="linux"
namespace="pre"
node_kubernetes_io_instance_type="t3a.medium"
node_lifecycle="on-demand"
persistentvolumeclaim="production-peer1-0"
role="workers-zone-1a"
severity="medium"
topology_ebs_csi_aws_com_zone="eu-west-1a"
topology_kubernetes_io_region="eu-west-1"
topology_kubernetes_io_zone="eu-west-1a"
|
firing |
2024-09-25 15:31:52.114227227 +0000 UTC |
71.31900544517393 |
Annotations |
- description
- VALUE = 71.31900544517393
LABELS: map[alpha_eksctl_io_cluster_name:pre3 alpha_eksctl_io_instance_id:i-022c2d5862c5caa3c alpha_eksctl_io_nodegroup_name:ng-7-workers-zone-1a-24 beta_kubernetes_io_arch:amd64 beta_kubernetes_io_instance_type:t3a.medium beta_kubernetes_io_os:linux failure_domain_beta_kubernetes_io_region:eu-west-1 failure_domain_beta_kubernetes_io_zone:eu-west-1a instance:ip-192-168-38-223.eu-west-1.compute.internal job:kubernetes-nodes k8s_io_cloud_provider_aws:5c95b4e26f974af0109535d635c2eeb9 kubernetes_io_arch:amd64 kubernetes_io_hostname:ip-192-168-38-223.eu-west-1.compute.internal kubernetes_io_os:linux namespace:pre node_kubernetes_io_instance_type:t3a.medium node_lifecycle:on-demand persistentvolumeclaim:production-peer1-0 role:workers-zone-1a topology_ebs_csi_aws_com_zone:eu-west-1a topology_kubernetes_io_region:eu-west-1 topology_kubernetes_io_zone:eu-west-1a]
- summary
- fabricUsedDisk greater than 60% (instance ip-192-168-38-223.eu-west-1.compute.internal)
|
alertname="fabricUsedDisk"
alpha_eksctl_io_cluster_name="pre3"
alpha_eksctl_io_instance_id="i-05a8c64819f48f4e9"
alpha_eksctl_io_nodegroup_name="ng-7-workers-zone-1c-24"
beta_kubernetes_io_arch="amd64"
beta_kubernetes_io_instance_type="t3a.medium"
beta_kubernetes_io_os="linux"
failure_domain_beta_kubernetes_io_region="eu-west-1"
failure_domain_beta_kubernetes_io_zone="eu-west-1c"
instance="ip-192-168-87-210.eu-west-1.compute.internal"
job="kubernetes-nodes"
k8s_io_cloud_provider_aws="5c95b4e26f974af0109535d635c2eeb9"
kubernetes_io_arch="amd64"
kubernetes_io_hostname="ip-192-168-87-210.eu-west-1.compute.internal"
kubernetes_io_os="linux"
namespace="pre"
node_kubernetes_io_instance_type="t3a.medium"
node_lifecycle="on-demand"
persistentvolumeclaim="production-peer2-0"
role="workers-zone-1c"
severity="medium"
topology_ebs_csi_aws_com_zone="eu-west-1c"
topology_kubernetes_io_region="eu-west-1"
topology_kubernetes_io_zone="eu-west-1c"
|
firing |
2024-09-18 11:31:52.114227227 +0000 UTC |
71.31966046429065 |
Annotations |
- description
- VALUE = 71.31966046429065
LABELS: map[alpha_eksctl_io_cluster_name:pre3 alpha_eksctl_io_instance_id:i-05a8c64819f48f4e9 alpha_eksctl_io_nodegroup_name:ng-7-workers-zone-1c-24 beta_kubernetes_io_arch:amd64 beta_kubernetes_io_instance_type:t3a.medium beta_kubernetes_io_os:linux failure_domain_beta_kubernetes_io_region:eu-west-1 failure_domain_beta_kubernetes_io_zone:eu-west-1c instance:ip-192-168-87-210.eu-west-1.compute.internal job:kubernetes-nodes k8s_io_cloud_provider_aws:5c95b4e26f974af0109535d635c2eeb9 kubernetes_io_arch:amd64 kubernetes_io_hostname:ip-192-168-87-210.eu-west-1.compute.internal kubernetes_io_os:linux namespace:pre node_kubernetes_io_instance_type:t3a.medium node_lifecycle:on-demand persistentvolumeclaim:production-peer2-0 role:workers-zone-1c topology_ebs_csi_aws_com_zone:eu-west-1c topology_kubernetes_io_region:eu-west-1 topology_kubernetes_io_zone:eu-west-1c]
- summary
- fabricUsedDisk greater than 60% (instance ip-192-168-87-210.eu-west-1.compute.internal)
|
|
|
|
# HostMemoryUnderMemoryPressure: warns when a host is taking major page faults
# at a sustained rate.
# The expression compares the per-second rate of `node_vmstat_pgmajfault`,
# averaged over a 2-minute window, against a threshold of 8.
# There is no `for:` clause, so the alert fires on the first evaluation where
# the expression is true.
alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[2m]) > 8
labels:
  severity: warning
annotations:
  # Literal block scalar: line breaks are kept, the trailing newline is
  # stripped. NOTE(review): any extra leading spaces the original had on the
  # VALUE/LABELS lines were lost in the flattened source — confirm if needed.
  description: |-
    The node is under heavy memory pressure. High rate of major page faults
    VALUE = {{ $value }}
    LABELS: {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
|
|
|
|
|
|
# InstanceDown: critical alert for a scrape target whose `up` series is 0.
# `for: 10s` means the condition must hold continuously for 10 seconds before
# the alert moves from pending to firing; the description text below is kept
# in sync with that duration.
alert: InstanceDown
expr: up == 0
for: 10s
labels:
  severity: critical
annotations:
  # Single-quoted because the value starts with `{{`, which a plain scalar
  # would parse as a flow mapping.
  description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 seconds.'
  summary: Endpoint {{ $labels.instance }} down
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|