Alerting Using Prometheus Rules
Below are example alerting rules for Prometheus.
These rules cover basic cluster health and operation.
# Fires when MySQL reports read_only == 1 on a host for which a
# tungsten_manager_service series with role="master" exists. The two metrics
# are joined on the hostname label only; the result keeps the labels of the
# MySQL metric.
- alert: MasterReadOnly
  expr: >-
    mysql_global_variables_read_only == 1
    and on(hostname) (tungsten_manager_service{role="master"})
  for: 2m
  # Prometheus rules accept free-form text only under `annotations`;
  # a `description` key directly on the rule is rejected when loading.
  annotations:
    description: >-
      Database is read only on {{$labels.hostname}},
      but the role is master.
# Fires when a scrape target of a tungsten-exporters* job whose instance ends
# in 8091 (the port this rule associates with the replicator exporter) has
# been down (up == 0) for 10 minutes.
# The label matchers are applied directly to `up` instead of and-ing with a
# name-less selector, which would scan every series of those targets.
# NOTE(review): the description interpolates the hostname label; `up` only
# carries target labels, so confirm the scrape config attaches hostname.
- alert: TungstenReplicatorDown
  expr: up{job=~"tungsten-exporters.*",instance=~".*8091"} == 0
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten replicator down or unreachable on {{$labels.hostname}},
      please verify that the replicator is running and exporter is
      returning metrics
# Fires when a scrape target of a tungsten-exporters* job whose instance ends
# in 8092 (the port this rule associates with the manager exporter) has been
# down (up == 0) for 10 minutes.
# The label matchers are applied directly to `up` instead of and-ing with a
# name-less selector, which would scan every series of those targets.
# NOTE(review): the description interpolates the hostname label; `up` only
# carries target labels, so confirm the scrape config attaches hostname.
- alert: TungstenManagerDown
  expr: up{job=~"tungsten-exporters.*",instance=~".*8092"} == 0
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten manager down or unreachable on {{$labels.hostname}},
      please verify that the manager is running and exporter is
      returning metrics
# Fires when a scrape target of a tungsten-exporters* job whose instance ends
# in 8093 (the port this rule associates with the connector exporter) has
# been down (up == 0) for 10 minutes.
# The label matchers are applied directly to `up` instead of and-ing with a
# name-less selector, which would scan every series of those targets.
# NOTE(review): the description interpolates the hostname label; `up` only
# carries target labels, so confirm the scrape config attaches hostname.
- alert: TungstenConnectorDown
  expr: up{job=~"tungsten-exporters.*",instance=~".*8093"} == 0
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten connector down or unreachable on {{$labels.hostname}},
      please verify that the connector is running and exporter is
      returning metrics
# Fires when any tungsten_replicator_service series has a state label other
# than "online" for 10 minutes. The selector has no value comparison, so a
# matching series fires regardless of its sample value.
- alert: TungstenReplicatorOffline
  expr: tungsten_replicator_service{state!="online"}
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten replicator not online on {{$labels.hostname}},
      please investigate. (shell> trepctl status)
# Fires when any tungsten_manager_service series has a state label other
# than "online" for 10 minutes. The selector has no value comparison, so a
# matching series fires regardless of its sample value.
- alert: TungstenManagerOffline
  expr: tungsten_manager_service{state!="online"}
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten manager not online on {{$labels.hostname}},
      please investigate. (shell> trepctl status) and
      (shell> echo ls | cctrl)
# Fires when any tungsten_manager_policy series has a policy label other
# than "AUTOMATIC" for 6 hours; the long hold time leaves room for planned
# maintenance windows before alerting.
- alert: TungstenManagerMaintenance
  expr: tungsten_manager_policy{policy!="AUTOMATIC"}
  for: 6h
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten manager policy not AUTOMATIC on {{$labels.hostname}},
      please check if cluster is still under maintenance.
# Fires when, per vip, the summed tungsten_replicator_service samples with
# role="master" and state="online" exceed 1 for 10 minutes.
# The comparison is `> 1` rather than `!= 1` so the rule only fires for the
# condition its name and message describe (more than one master) and never
# for a sum below one.
- alert: TungstenTwoReplicatorMasters
  expr: >-
    sum by (vip) (tungsten_replicator_service{role="master",state="online"})
    > 1
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten - multiple replicator masters (shell> trepctl services)
      for {{$labels.vip}}, cannot serve two masters.
      Please investigate immediately.
# Fires when used JVM heap, as a percentage of the maximum heap, stays above
# 90 for 20 minutes. Used and max series are matched one-to-one on their
# full label sets.
- alert: TungstenHeapSpaceUsage
  expr: >-
    jvm_memory_bytes_used{area="heap"}
    / jvm_memory_bytes_max{area="heap"} * 100 > 90
  for: 20m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten - heap space more than 90% full for more than 20 minutes
      on {{$labels.instance}}. (look at tmsvc.log)
# Fires when the replicator's relative latency stays above 3600 for
# 10 minutes.
# NOTE(review): 3600 lines up with the "60 minutes" in the description only
# if the metric is reported in seconds - confirm against the exporter.
- alert: TungstenReplicaStale
  expr: tungsten_replicator_latency{latency="relative"} > 3600
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten - no updates on replica {{$labels.hostname}} for 60 minutes.
      Check if replicas are behind or if there is no DB activity
      to replicate
# Fires when the per-second rate of the current sequence number, computed
# over a 10-minute window, has been exactly 0 for a further 10 minutes.
- alert: TungstenReplicaNoProgress
  expr: rate(tungsten_replicator_seqno{seqno="current"}[10m]) == 0
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten - no updates on replica {{$labels.hostname}} for over
      10 minutes. Check if replicas are behind or if there is no DB
      activity to replicate
# Fires when, per vip, both the summed role="master" samples and the summed
# role="relay" samples of tungsten_manager_service are below 1 for
# 10 minutes.
# NOTE(review): sum() over a selector that matches no series returns no
# result rather than 0, so this can only fire if the exporter still emits
# role="master"/"relay" series with value 0 - confirm against the exporter.
- alert: TungstenZeroDatasourceMasters
  expr: >-
    (sum by (vip) (tungsten_manager_service{role="master"}) < 1)
    and
    (sum by (vip) (tungsten_manager_service{role="relay"}) < 1)
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten - zero datasource masters/relays (shell> echo ls | cctrl)
      for {{$labels.vip}}, cannot function with no datasource masters.
      Please investigate immediately.
# Fires when, for a hostname, the summed role="master" samples reported by
# the manager differ from those reported by the replicator for 10 minutes.
# A plain `!=` is used because `(a == b) != 1` filters first: the inner `==`
# drops every host whose counts differ, so that form could only fire when
# the two sides agreed.
# Hosts present on only one side of the comparison produce no result.
- alert: TungstenMasterRoleNotConsistent
  expr: >-
    sum by (hostname) (tungsten_manager_service{role="master"})
    != sum by (hostname) (tungsten_replicator_service{role="master"})
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten role=master not consistent between manager and replicator
      on {{$labels.hostname}}.
# Fires when, for a hostname, the summed role="slave" samples reported by
# the manager differ from those reported by the replicator for 10 minutes.
# A plain `!=` is used because `(a == b) != 1` filters first: the inner `==`
# drops every host whose counts differ, so that form could only fire when
# the two sides agreed.
# Hosts present on only one side of the comparison produce no result.
- alert: TungstenSlaveRoleNotConsistent
  expr: >-
    sum by (hostname) (tungsten_manager_service{role="slave"})
    != sum by (hostname) (tungsten_replicator_service{role="slave"})
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten role=slave not consistent between manager and replicator
      on {{$labels.hostname}}.
# Fires when, for a hostname, the summed role="relay" samples reported by
# the manager differ from those reported by the replicator for 10 minutes.
# A plain `!=` is used because `(a == b) != 1` filters first: the inner `==`
# drops every host whose counts differ, so that form could only fire when
# the two sides agreed.
# Hosts present on only one side of the comparison produce no result.
- alert: TungstenRelayRoleNotConsistent
  expr: >-
    sum by (hostname) (tungsten_manager_service{role="relay"})
    != sum by (hostname) (tungsten_replicator_service{role="relay"})
  for: 10m
  # Prometheus rules accept free-form text only under `annotations`.
  annotations:
    description: >-
      Tungsten role=relay not consistent between manager and replicator
      on {{$labels.hostname}}.