Skip to content
Snippets Groups Projects
Commit 639b4857 authored by Ben Kochie
Browse files

Adjust alerting thresholds

Adjust the `FOR` interval on alerts to avoid alert flapping induced by scrape failures.
parent 6a0d4095
No related branches found
No related tags found
1 merge request: !220 Adjust alerting thresholds
ALERT DBHeavyLoad
IF node_load1{instance=~"db.*"} > 200
-FOR 1m
+FOR 5m
LABELS {severity="critical", channel="infrastructure"}
ANNOTATIONS {
title="High load in database {{ $labels.fqdn }}: {{$value}}",
Loading
Loading
## 'git cat-file' processes
ALERT HighGitCatFileCount
IF sum without(fqdn, instance) (process_count{name="git cat-file"}) >= 40
-FOR 1m
+FOR 5m
LABELS {severity="warn", channel="infrastructure"}
ANNOTATIONS {
title="Number of 'git cat-file' processes is high - {{$value}}",
Loading
Loading
## gitlab.com
ALERT GitlabComDown
IF probe_http_status_code{instance="gitlab.com",job="blackbox"} != 200
-FOR 1m
+FOR 5m
LABELS {severity="critical", pager="pagerduty"}
ANNOTATIONS {
title="GitLab.com is down for 1 minute",
Loading
Loading
## KIBANA IS DOWN
ALERT KibanaDown
IF node_systemd_unit_state{name="kibana.service",state="active"} == 0
-FOR 10s
+FOR 5m
LABELS {severity="critical"}
ANNOTATIONS {
title="Kibana is down",
Loading
Loading
## Target is down
ALERT OtherPrometheusDown
IF up{job="prometheus-other-instance"} == 0
-FOR 2m
+FOR 5m
LABELS {severity="critical",pager="pagerduty"}
ANNOTATIONS {
title="Prometheus server is down!",
Loading
Loading
Loading
Loading
@@ -4,7 +4,7 @@ ALERT LowDiskSpace
/
node_filesystem_size{job="base-debian", fstype=~"(ext.|xfs)"}
* 100 <= 10
-FOR 1m
+FOR 15m
LABELS { severity = "critical" }
ANNOTATIONS {
title = "Really low disk space left on {{ $labels.mountpoint }} on {{ $labels.fqdn }}: {{ $value | printf \"%.2f\" }}%",
Loading
Loading
## gitlab-examples.gitlab.io
ALERT GitlabPagesDown
IF probe_http_status_code{instance="gitlab-examples.gitlab.io",job="blackbox"} != 200
-FOR 1m
+FOR 5m
LABELS {severity="critical", pager="pagerduty"}
ANNOTATIONS {
title="GitLab pages is down for 1 minute",
Loading
Loading
Loading
Loading
@@ -3,7 +3,7 @@ ALERT PostgreSQLReplicationStopped
# By floating point rules, NaN != NaN, so we can't just apply "== NaN" as a filter,
# so we use "x != x" instead to get all elements that are NaN.
IF pg_stat_replication_pg_xlog_location_diff{job="gitlab-cluster-db"} != pg_stat_replication_pg_xlog_location_diff{job="gitlab-cluster-db"}
-FOR 1m
+FOR 5m
LABELS {severity="critical", channel="infrastructure", pager="pagerduty"}
ANNOTATIONS {
title="PostgreSQL replication has stopped",
Loading
Loading
## registry.gitlab.com
ALERT RegistryDown
IF probe_http_status_code{instance="registry.gitlab.com", job="blackbox-ssl"} != 200
-FOR 1m
+FOR 5m
LABELS {severity="critical", pager="pagerduty"}
ANNOTATIONS {
title="GitLab Registry is down for 1 minute",
Loading
Loading
Loading
Loading
@@ -43,10 +43,10 @@ gitlab:runners_nginx_docker_registry_cache_status = gitlab:runners_cache_registr
 
ALERT RunnersNginxDockerRegistryCacheDown
IF gitlab:runners_nginx_docker_registry_cache_status != 1111
-FOR 1m
+FOR 5m
LABELS {severity="critical"}
ANNOTATIONS {
title='Runners cache services on {{ .Labels.hostname }}: {{ range printf "gitlab:runners_cache_registry_nginx_service{hostname=\'%s\'}" .Labels.hostname | query }} nginx {{ if eq (.Value | humanize) "1" }}up{{else}}down{{ end }}{{ end }}, {{ range printf "gitlab:runners_cache_registry_docker_service{hostname=\'%s\'}" .Labels.hostname | query }} docker {{ if eq (.Value | humanize) "1" }}up{{else}}down{{ end }}{{ end}}, {{ range printf "gitlab:runners_registry{hostname=\'%s\'}" .Labels.hostname | query }} registry {{ if eq (.Value | humanize) "1" }}up{{else}}down{{ end }}{{ end}}, {{ range printf "gitlab:runners_cache{hostname=\'%s\'}" .Labels.hostname | query }} minio {{ if eq (.Value | humanize) "1" }}up{{else}}down{{ end }}{{ end}}.',
runbook="troubleshooting/runners_cache_is_down.md",
description="This impacts CI execution builds, consider tweeting: !tweet 'CI executions are being delayed due to our runners cache being down at GitLab.com, we are investigating the root cause'"
-}
\ No newline at end of file
+}
## sentry.gitlap.com
ALERT SentryDown
IF probe_http_status_code{instance="sentry.gitlap.com", job="blackbox-ssl"} != 200
-FOR 1m
+FOR 5m
LABELS {severity="critical"}
ANNOTATIONS {
title="sentry.gitlap.com is down",
Loading
Loading
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment