Commit 9d624aa8, authored by Julius Volz, committed by Ben Kochie

Prometheus metamonitoring rules

parent 07f7cc57
Merge request !222: Prometheus metamonitoring rules
Showing 490 additions and 0 deletions
ALERT PrometheusUnreachable
IF up{job=~"prometheus.*"} == 0
FOR 10m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is unreachable",
description = "{{$labels.job}} at {{$labels.instance}} could not be scraped for over 10 minutes.",
runbook = "troubleshooting/prometheus-is-down.md",
}
ALERT PrometheusManyRestarts
IF changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3
FOR 30m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is restarting frequently",
description = "{{$labels.job}} at {{$labels.instance}} has restarted more than 3 times in the last 30 minutes. It might be crashlooping.",
runbook = "troubleshooting/prometheus-is-down.md",
}
ALERT PrometheusManyFileSDReadErrors
IF
rate(prometheus_sd_file_read_errors_total{job=~"prometheus.*"}[5m])
/
rate(prometheus_sd_file_scan_duration_seconds_count{job=~"prometheus.*"}[5m])
* 100
> 5
FOR 10m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} has many DNS-SD errors",
description = "{{$labels.job}} at {{$labels.instance}} has {{$value}}% of DNS-SD requests failing.",
runbook = "troubleshooting/prometheus-file-sd-errors.md",
}
ALERT PrometheusRuleEvaluationSlow
IF prometheus_evaluator_duration_seconds{quantile="0.9",job=~"prometheus.*"} > 60
FOR 10m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is evaluating rules too slowly",
description = "{{$labels.job}} at {{$labels.instance}} has a 90th percentile latency of {{$value}}s completing rule evaluation cycles.",
runbook = "troubleshooting/prometheus-slow-rule-eval.md",
}
ALERT PrometheusCheckpointingSlow
IF
avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m])
>
prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000 # Allow 200µs per chunk.
FOR 5m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is checkpointing too slowly",
description = "{{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average for each checkpoint.",
runbook = "troubleshooting/prometheus-indexing-backlog.md",
}
ALERT PrometheusIndexingBacklog
IF
prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"}
/
prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
* 100
> 10
FOR 30m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is backlogging on the indexing queue",
description = "{{$labels.job}} at {{$labels.instance}} is backlogging on the indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}% full.",
runbook = "troubleshooting/prometheus-indexing-backlog.md",
}
ALERT PrometheusNotIngestingSamples
IF rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m]) == 0
FOR 5m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is not ingesting samples",
description = "{{$labels.job}} at {{$labels.instance}} has not ingested any samples in the last 10 minutes.",
runbook = "troubleshooting/prometheus-not-ingesting.md",
}
ALERT PrometheusPersistErrors
IF rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m]) > 0
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} has persist errors",
description = "{{$labels.job}} at {{$labels.instance}} has encountered {{$value}} persist errors per second in the last 10 minutes.",
runbook = "troubleshooting/prometheus-persist-errors.md",
}
ALERT PrometheusNotificationsBacklog
IF prometheus_notifications_queue_length{job=~"prometheus.*"} > 0
FOR 10m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is backlogging on the notifications queue",
description = "{{$labels.job}} at {{$labels.instance}} is backlogging on the notifications queue. The queue has not been empty for 10 minutes. Current queue length: {{$value}}.",
runbook = "troubleshooting/prometheus-notifications-backlog.md",
}
ALERT PrometheusScrapingSlowly
# The match in the interval label excludes any intervals >= 1m.
IF prometheus_target_interval_length_seconds{quantile="0.9",interval!~".*m.*",job=~"prometheus.*"} > 2 * 60
FOR 10m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is scraping targets slowly",
description = "{{$labels.job}} at {{$labels.instance}} has a 90th percentile latency of {{$value}}s for scraping targets in the {{$labels.interval}} target pool.",
runbook = "troubleshooting/prometheus-slow-scrapes.md",
}
ALERT PrometheusStorageInconsistent
IF prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} has an inconsistent storage",
description = "{{$labels.job}} at {{$labels.instance}} has detected a storage inconsistency. A server restart is needed to initiate recovery.",
runbook = "troubleshooting/prometheus-storage-inconsistent.md",
}
ALERT PrometheusPersistencePressureTooHigh
IF
prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"} > 0.8
AND
predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 24) > 1
FOR 30m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} can not keep up persisting",
description = "{{$labels.job}} at {{$labels.instance}} is approaching critical persistence pressure. Throttled ingestion expected within the next 24h.",
runbook = "troubleshooting/prometheus-persistence-pressure-high.md",
}
ALERT PrometheusPersistencePressureTooHigh
IF
prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"} > 0.85
AND
predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 2) > 1
FOR 30m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} can not keep up persisting",
description = "{{$labels.job}} at {{$labels.instance}} is approaching critical persistence pressure. Throttled ingestion expected within the next 2h.",
runbook = "troubleshooting/prometheus-persistence-pressure-high.md",
}
ALERT PrometheusSeriesMaintenanceStalled
IF
prometheus_local_storage_memory_series{job=~"prometheus.*"}
/ on(job,instance)
rate(prometheus_local_storage_series_ops_total{type="maintenance_in_memory",job=~"prometheus.*"}[5m])
/ 3600
> 24
AND on(job,instance)
prometheus_local_storage_rushed_mode == 1
FOR 1h
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is maintaining memory time series too slowly",
description = "{{$labels.job}} at {{$labels.instance}} is maintaining memory time series so slowly that it will take {{$value | printf `%.0f`}}h to complete a full cycle. This will lead to persistence falling behind.",
runbook = "troubleshooting/prometheus-slow-series-maintenance.md",
}
ALERT PrometheusInvalidConfigFile
IF prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0
FOR 30m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} has an invalid config",
description = "The configuration file for {{$labels.job}} at {{$labels.instance}} is invalid and was therefore not reloaded.",
runbook = "troubleshooting/prometheus-invalid-config.md",
}
ALERT PrometheusOutOfOrderSamplesDiscarded
IF increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m]) > 0
FOR 1h
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is discarding out-of-order samples",
description = "{{$labels.job}} at {{$labels.instance}} has discarded {{$value}} out-of-order samples over the last hour.",
runbook = "troubleshooting/prometheus-out-of-order.md",
}
# Prometheus Checkpointing Slow
## Symptoms
Prometheus is taking a long time to checkpoint its unpersisted in-memory
state to its checkpoint file. Checkpoint times should normally be under 5 minutes,
and specifically no more than 200µs per unpersisted chunk.
## Possible checks
Check how the value of `prometheus_local_storage_checkpoint_last_duration_seconds`
developed over time. Did it increase recently? Did the number of time
series grow recently, which could have led to more chunks in the checkpoint?
Graph `prometheus_local_storage_memory_series`.
In general, any kind of load can contribute to longer checkpoint times,
especially IO load.
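To compare the last checkpoint duration directly against the 200µs-per-chunk budget that the alert allows, a ratio like the following sketch can be graphed (it reuses the alert's metrics and `job` matcher; adjust as needed). A value above 1 means the checkpoint exceeded the budget.

```
# Last checkpoint duration relative to the 200µs-per-chunk budget from the alert.
  prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}
/
  (prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000)
```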
## Resolution
Reduce the load on the Prometheus server by either reducing the number of
handled time series, the number of rules, rates of queries, or other causes
of load.
# Prometheus FileSD read errors
## Symptoms
The `rate(prometheus_sd_file_read_errors_total[5m])` expression is showing
an elevated error rate, and new targets are not being picked up from the SD files.
## Possible checks
1. Log in to the server and study the Prometheus logs.
1. Look for lines containing "Error reading file".
The specific error message should say why the file couldn't be read.
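To quantify how often scans end in read errors, the percentage the alert is based on can be graphed directly; this is the alert's own expression, shown as a sketch with its `job` matcher:

```
# Percentage of file-SD scans that resulted in read errors, per instance.
  rate(prometheus_sd_file_read_errors_total{job=~"prometheus.*"}[5m])
/
  rate(prometheus_sd_file_scan_duration_seconds_count{job=~"prometheus.*"}[5m])
* 100
```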
## Resolution
If the file couldn't be read because it is a malformed target file, fix the file.
If the file couldn't be read because there was a permissions error, fix the file permissions.
If the file couldn't be read because of a disk I/O error, fix the disk or move Prometheus to a healthy machine.
# Prometheus Indexing Backlog
## Symptoms
Prometheus is taking a long time to index new time series, and thus newly
appearing series take a while to be queryable.
## Possible checks
Did the number of time series increase recently?
Graph `prometheus_local_storage_memory_series` to see.
How did the indexing queue develop? See `prometheus_local_storage_indexing_queue_length`.
Is it going down? Was it just one temporary spike of many new metrics?
In general, any kind of load can contribute to an indexing backlog,
but most of the time it is caused by trying to handle too many time series
in one Prometheus server.
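To see how full the indexing queue is (the ratio the alert fires on), graph something like this sketch, which reuses the alert's metrics and `job` matcher:

```
# Indexing queue fullness in percent, per Prometheus instance.
  prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"}
/
  prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
* 100
```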
## Resolution
Reduce the load on the Prometheus server, especially the number of handled
or changing time series.
# Prometheus Invalid Configuration File
## Symptoms
Prometheus cannot load its configuration file and will therefore keep using the
previously loaded configuration. On restart, Prometheus will fail to start
because it cannot load its configuration.
## Possible checks
Log in to the Prometheus server and check the logs to see what exact
configuration error is being reported.
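To see which instances are affected, and for how long they have been running on a stale configuration, queries along these lines can help. This is a sketch: the first expression is the alert condition itself; the reload-timestamp metric is an assumption and may not exist on all Prometheus versions.

```
# Instances whose last configuration reload failed (alert condition).
prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0

# Seconds since the last successful reload (assumed metric name).
time() - prometheus_config_last_reload_success_timestamp_seconds{job=~"prometheus.*"}
```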
## Resolution
Fix the reported configuration error.
# Prometheus Not Ingesting
## Symptoms
Prometheus is not ingesting any new samples, so new data points will not
appear in queries, and alerts will have no data to work on.
## Possible checks
To check whether this is a misconfiguration (no targets configured or
discovered), check the `/targets` page on the Prometheus server to verify
that there are discovered targets.
To check whether there is another problem, log in to the machine and check
the Prometheus logs and general Prometheus health metrics.
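The per-instance ingestion rate that the alert watches can be graphed directly; this sketch reuses the alert's expression:

```
# Samples ingested per second; a flat zero confirms the alert condition.
rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m])
```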
## Resolution
Fix the targets misconfiguration or fix whatever other problem was indicated
in the logs.
# Prometheus Notifications Backlog
## Symptoms
Prometheus is having trouble working off its queue of notifications to send
to Alertmanager. Alert notifications may get delivered late or not at all.
## Possible checks
See how `prometheus_notifications_queue_length` developed
over time. Log in to the machine and check the Prometheus logs to see if
Prometheus is encountering any errors while sending alerts to Alertmanager.
Check that Alertmanager is reachable and not overloaded.
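A few expressions that can help narrow this down, sketched with the alert's `job` matcher; the capacity and error metrics are standard Prometheus self-metrics, but verify that they exist on your version:

```
# How full the notification queue is, in percent.
  prometheus_notifications_queue_length{job=~"prometheus.*"}
/
  prometheus_notifications_queue_capacity{job=~"prometheus.*"}
* 100

# Rate of errors while sending notifications to Alertmanager.
rate(prometheus_notifications_errors_total{job=~"prometheus.*"}[5m])
```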
## Resolution
Depending on the above checks, either address the errors that are logged
by Prometheus or ensure that Alertmanager is healthy again.
# Prometheus Out-of-Order Samples
## Symptoms
Prometheus is ingesting samples for the same series with duplicate timestamps,
but different values.
## Possible checks
Check whether any two targets got relabeled into the same labelset.
Are there any targets that explicitly set client-side timestamps incorrectly
in their `/metrics` output?
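To see which instances are discarding samples and at what rate, graph the metric the alert uses; this is a sketch with the alert's `job` matcher:

```
# Out-of-order samples discarded over the last 10 minutes, per instance.
increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
```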
## Resolution
Fix the erroneous relabeling rules or the targets that produce incorrectly
timestamped data.
# Prometheus Persist Errors
## Symptoms
Prometheus is encountering errors while persisting sample chunks.
## Possible checks
See how `rate(prometheus_local_storage_persist_errors_total[10m])` developed
over time. Log in to the machine and check the Prometheus logs to see the
exact error that is occurring. Most likely, the disk is full or there are
IO errors.
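The error rate can be broken down per instance with the alert's own expression. If node exporter metrics are available, disk usage can be checked as well; the filesystem metric names and mountpoint below are assumptions that depend on your node exporter version and setup.

```
# Persist errors per second, per Prometheus instance (alert expression).
rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m])

# Percentage of free space on the Prometheus storage filesystem
# (assumed node exporter metric names and mountpoint).
  node_filesystem_avail{mountpoint="/prometheus"}
/
  node_filesystem_size{mountpoint="/prometheus"}
* 100
```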
## Resolution
If the disk is full, either reduce the retention time, use a larger disk,
or put fewer time series on this Prometheus server.
If there are other errors in the log, address those.
# Prometheus Persistence Pressure Too High
## Symptoms
Prometheus is approaching critical persistence pressure, meaning that it cannot
keep up with persisting the number of ingested samples. Eventually, ingestion
will be throttled as a result of this.
## Possible checks
Did the number of time series increase recently?
Graph `prometheus_local_storage_memory_series` to see.
In general, any kind of load can contribute to persistence pressure,
but most of the time it is caused by handling too many time series
or scraping too frequently on one Prometheus server.
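The urgency score and its projected trend, the two conditions of the alert, can be graphed with this sketch:

```
# Current persistence urgency score (ingestion is throttled when it reaches 1).
prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}

# Projected score 24 hours from now, extrapolated from the last 30 minutes.
predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 24)
```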
## Resolution
Reduce the load on the Prometheus server, especially the number of handled
or changing time series.
# Prometheus Rule Evaluation Slow
## Symptoms
Rule-based metrics are appearing with a lag, or not at all, because
Prometheus's rule evaluator is taking a long time to complete each cycle.
## Possible checks
Check how the expression `prometheus_evaluator_duration_seconds{quantile="0.9",job=~"prometheus.*"}`
developed over time. Did it recently increase by a lot? Perhaps the rule
evaluation got slower due to more time series. Check for a recent increase
in time series: `prometheus_local_storage_memory_series`.
The Prometheus server might also be overloaded by other work, or there might
simply be too many expensive rules configured.
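Two expressions worth graphing side by side, sketched with the alert's `job` matcher:

```
# 90th percentile duration of a rule evaluation cycle, in seconds.
prometheus_evaluator_duration_seconds{quantile="0.9",job=~"prometheus.*"}

# Number of in-memory time series, to correlate with any slowdown.
prometheus_local_storage_memory_series{job=~"prometheus.*"}
```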
## Resolution
Reduce the load on the Prometheus server by either reducing the number of
handled time series, the number of rules, rates of queries, or other causes
of load.
# Prometheus Scraping Slowly
## Symptoms
Prometheus is scraping targets slowly. New samples will appear more slowly
than the configured scrape interval allows, or not at all.
## Possible checks
See how `prometheus_target_interval_length_seconds{quantile="0.9"}` developed
over time. Did `count(up)` increase recently to indicate a higher number of
targets? Did `prometheus_local_storage_memory_series` increase recently to
indicate an overall larger number of time series that are scraped? Are the
targets themselves responsive on their `/metrics` endpoint?
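These checks translate into queries roughly like the following sketch (the `job` matcher on the self-metrics is taken from the alert; adjust as needed):

```
# 90th percentile of actual scrape interval lengths, per target pool.
prometheus_target_interval_length_seconds{quantile="0.9",job=~"prometheus.*"}

# Number of targets over time.
count(up)

# Number of in-memory time series over time.
prometheus_local_storage_memory_series{job=~"prometheus.*"}
```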
## Resolution
Depending on the above, either lower the load on Prometheus by reducing the
number of targets or time series, or ensure that your targets are quickly
scrapable.
# Prometheus Series Maintenance Stalled
## Symptoms
Prometheus is maintaining (persisting, archiving, truncating, purging, ...) in-memory
time series so slowly that it will take too long to complete a full cycle.
This will lead to persistence falling behind.
## Possible checks
Did the number of time series increase recently?
Graph `prometheus_local_storage_memory_series` to see. Also check the
rate of maintained series: `rate(prometheus_local_storage_series_ops_total{type="maintenance_in_memory"}[5m])`.
In general, any kind of load can contribute to slow series maintenance,
but most of the time this is caused by trying to handle too many time series
in one Prometheus server.
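The estimated hours to complete a full maintenance cycle, which is what the alert computes, can be graphed directly; this sketch mirrors the alert expression:

```
# Hours needed to maintain every in-memory series once at the current rate.
  prometheus_local_storage_memory_series{job=~"prometheus.*"}
/ on(job,instance)
  rate(prometheus_local_storage_series_ops_total{type="maintenance_in_memory",job=~"prometheus.*"}[5m])
/ 3600
```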
## Resolution
Reduce the load on the Prometheus server, especially the number of handled
or changing time series.
# Prometheus Storage Inconsistent
## Symptoms
Prometheus has encountered an inconsistency in its storage while reading from
or writing to it. Some series may now be inaccessible or damaged.
## Possible checks
Log in to the Prometheus server and check the logs to see if there is any specific
error pointed out. Did the server crash recently?
## Resolution
Restart Prometheus gracefully to trigger a recovery run.