Commit 9d624aa8, authored by Julius Volz, committed by Ben Kochie

Prometheus metamonitoring rules

parent 07f7cc57
Merge request !222: Prometheus metamonitoring rules
Showing 490 additions and 0 deletions
ALERT PrometheusUnreachable
IF up{job=~"prometheus.*"} == 0
FOR 10m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is unreachable",
description = "{{$labels.job}} at {{$labels.instance}} could not be scraped for over 10 minutes.",
runbook = "troubleshooting/prometheus-is-down.md",
}
ALERT PrometheusManyRestarts
IF changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3
FOR 30m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is restarting frequently",
description = "{{$labels.job}} at {{$labels.instance}} has restarted more than 3 times in the last 30 minutes. It might be crashlooping.",
runbook = "troubleshooting/prometheus-is-down.md",
}
ALERT PrometheusManyFileSDReadErrors
IF
rate(prometheus_sd_file_read_errors_total{job=~"prometheus.*"}[5m])
/
rate(prometheus_sd_file_scan_duration_seconds_count{job=~"prometheus.*"}[5m])
* 100
> 5
FOR 10m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} has many DNS-SD errors",
description = "{{$labels.job}} at {{$labels.instance}} has {{$value}}% of DNS-SD requests failing.",
runbook = "troubleshooting/prometheus-file-sd-errors.md",
}
ALERT PrometheusRuleEvaluationSlow
IF prometheus_evaluator_duration_seconds{quantile="0.9",job=~"prometheus.*"} > 60
FOR 10m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is evaluating rules too slowly",
description = "{{$labels.job}} at {{$labels.instance}} has a 90th percentile latency of {{$value}}s completing rule evaluation cycles.",
runbook = "troubleshooting/prometheus-slow-rule-eval.md",
}
ALERT PrometheusCheckpointingSlow
IF
avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m])
>
prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000 # Allow 200µs per chunk.
FOR 5m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is checkpointing too slowly",
description = "{{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average for each checkpoint.",
runbook = "troubleshooting/prometheus-indexing-backlog.md",
}
ALERT PrometheusIndexingBacklog
IF
prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"}
/
prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
* 100
> 10
FOR 30m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is backlogging on the indexing queue",
description = "{{$labels.job}} at {{$labels.instance}} is backlogging on the indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}% full.",
runbook = "troubleshooting/prometheus-indexing-backlog.md",
}
ALERT PrometheusNotIngestingSamples
IF rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m]) == 0
FOR 5m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is not ingesting samples",
description = "{{$labels.job}} at {{$labels.instance}} has not ingested any samples in the last 10 minutes.",
runbook = "troubleshooting/prometheus-not-ingesting.md",
}
ALERT PrometheusPersistErrors
IF rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m]) > 0
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} has persist errors",
description = "{{$labels.job}} at {{$labels.instance}} has encountered {{$value}} persist errors per second in the last 10 minutes.",
runbook = "troubleshooting/prometheus-persist-errors.md",
}
ALERT PrometheusNotificationsBacklog
IF prometheus_notifications_queue_length{job=~"prometheus.*"} > 0
FOR 10m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} is backlogging on the notifications queue",
description = "{{$labels.job}} at {{$labels.instance}} is backlogging on the notifications queue. The queue has not been empty for 10 minutes. Current queue length: {{$value}}.",
runbook = "troubleshooting/prometheus-notifications-backlog.md",
}
ALERT PrometheusScrapingSlowly
# The match in the interval label excludes any intervals >= 1m.
IF prometheus_target_interval_length_seconds{quantile="0.9",interval!~".*m.*",job=~"prometheus.*"} > 2 * 60
FOR 10m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is scraping targets slowly",
description = "{{$labels.job}} at {{$labels.instance}} has a 90th percentile latency of {{$value}}s for scraping targets in the {{$labels.interval}} target pool.",
runbook = "troubleshooting/prometheus-slow-scrapes.md",
}
ALERT PrometheusStorageInconsistent
IF prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} has an inconsistent storage",
description = "{{$labels.job}} at {{$labels.instance}} has detected a storage inconsistency. A server restart is needed to initiate recovery.",
runbook = "troubleshooting/prometheus-storage-inconsistent.md",
}
ALERT PrometheusPersistencePressureTooHigh
IF
prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"} > 0.8
AND
predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 24) > 1
FOR 30m
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} can not keep up persisting",
description = "{{$labels.job}} at {{$labels.instance}} is approaching critical persistence pressure. Throttled ingestion expected within the next 24h.",
runbook = "troubleshooting/prometheus-persistence-pressure-high.md",
}
ALERT PrometheusPersistencePressureTooHigh
IF
prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"} > 0.85
AND
predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 2) > 1
FOR 30m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} can not keep up persisting",
description = "{{$labels.job}} at {{$labels.instance}} is approaching critical persistence pressure. Throttled ingestion expected within the next 2h.",
runbook = "troubleshooting/prometheus-persistence-pressure-high.md",
}
ALERT PrometheusSeriesMaintenanceStalled
IF
prometheus_local_storage_memory_series{job=~"prometheus.*"}
/ on(job,instance)
rate(prometheus_local_storage_series_ops_total{type="maintenance_in_memory",job=~"prometheus.*"}[5m])
/ 3600
> 24
AND on(job,instance)
prometheus_local_storage_rushed_mode == 1
FOR 1h
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is maintaining memory time series too slowly",
description = "{{$labels.job}} at {{$labels.instance}} is maintaining memory time series so slowly that it will take {{$value | printf `%.0f`}}h to complete a full cycle. This will lead to persistence falling behind.",
runbook = "troubleshooting/prometheus-slow-series-maintenance.md",
}
ALERT PrometheusInvalidConfigFile
IF prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0
FOR 30m
LABELS {
service = "prometheus",
severity = "critical",
}
ANNOTATIONS {
title = "{{$labels.job}} has an invalid config",
description = "The configuration file for {{$labels.job}} at {{$labels.instance}} is invalid and was therefore not reloaded.",
runbook = "troubleshooting/prometheus-invalid-config.md",
}
ALERT PrometheusOutOfOrderSamplesDiscarded
IF increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m]) > 0
FOR 1h
LABELS {
service = "prometheus",
severity = "warn",
}
ANNOTATIONS {
title = "{{$labels.job}} is discarding out-of-order samples",
description = "{{$labels.job}} at {{$labels.instance}} has discarded {{$value}} out-of-order samples over the last hour.",
runbook = "troubleshooting/prometheus-out-of-order.md",
}
# Prometheus Checkpointing Slow
## Symptoms
Prometheus is taking a long time to checkpoint its unpersisted in-memory
state to its checkpoint file. Checkpoint times should normally be under 5 minutes,
and specifically no more than 200µs per unpersisted chunk.
## Possible checks
Check how the value of `prometheus_local_storage_checkpoint_last_duration_seconds`
developed over time. Did it increase recently? Did the number of time
series grow recently, which could have led to more chunks in the checkpoint?
Graph `prometheus_local_storage_memory_series`.
In general, any kind of load can contribute to longer checkpoint times,
especially IO load.
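To compare the last checkpoint duration directly against the 200µs-per-chunk budget that the alert allows, a ratio like the following sketch can be graphed (it reuses the alert's metrics and `job` matcher; adjust as needed). A value above 1 means the checkpoint exceeded the budget.

```
# Last checkpoint duration relative to the 200µs-per-chunk budget from the alert.
  prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}
/
  (prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000)
```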
## Resolution
Reduce the load on the Prometheus server by either reducing the number of
handled time series, the number of rules, rates of queries, or other causes
of load.
# Prometheus FileSD read errors
## Symptoms
The `rate(prometheus_sd_file_read_errors_total[5m])` expression is showing
an elevated error rate, and new targets are not being picked up from the SD files.
## Possible checks
1. Log in to the server and study the Prometheus logs.
1. Look for lines containing "Error reading file".
The specific error message should say why the file couldn't be read.
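To quantify how often scans end in read errors, the percentage the alert is based on can be graphed directly; this is the alert's own expression, shown as a sketch with its `job` matcher:

```
# Percentage of file-SD scans that resulted in read errors, per instance.
  rate(prometheus_sd_file_read_errors_total{job=~"prometheus.*"}[5m])
/
  rate(prometheus_sd_file_scan_duration_seconds_count{job=~"prometheus.*"}[5m])
* 100
```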
## Resolution
If the file couldn't be read because it is a malformed target file, fix the file.
If the file couldn't be read because there was a permissions error, fix the file permissions.
If the file couldn't be read because of a disk I/O error, fix the disk or move Prometheus to a healthy machine.
# Prometheus Indexing Backlog
## Symptoms
Prometheus is taking a long time to index new time series, and thus newly
appearing series take a while to be queryable.
## Possible checks
Did the number of time series increase recently?
Graph `prometheus_local_storage_memory_series` to see.
How did the indexing queue develop? See `prometheus_local_storage_indexing_queue_length`.
Is it going down? Was it just one temporary spike of many new metrics?
In general, any kind of load can contribute to an indexing backlog,
but most of the time it is caused by trying to handle too many time series
in one Prometheus server.
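To see how full the indexing queue is (the ratio the alert fires on), graph something like this sketch, which reuses the alert's metrics and `job` matcher:

```
# Indexing queue fullness in percent, per Prometheus instance.
  prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"}
/
  prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
* 100
```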
## Resolution
Reduce the load on the Prometheus server, especially the number of handled
or changing time series.
# Prometheus Invalid Configuration File
## Symptoms
Prometheus cannot load its configuration file and will therefore keep using the
previously loaded configuration. On restart, Prometheus will fail to start
because it cannot load its configuration.
## Possible checks
Log in to the Prometheus server and check the logs to see what exact
configuration error is being reported.
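To see which instances are affected, and for how long they have been running on a stale configuration, queries along these lines can help. This is a sketch: the first expression is the alert condition itself; the reload-timestamp metric is an assumption and may not exist on all Prometheus versions.

```
# Instances whose last configuration reload failed (alert condition).
prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0

# Seconds since the last successful reload (assumed metric name).
time() - prometheus_config_last_reload_success_timestamp_seconds{job=~"prometheus.*"}
```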
## Resolution
Fix the reported configuration error.
# Prometheus Not Ingesting
## Symptoms
Prometheus is not ingesting any new samples, so new data points will not
appear in queries, and alerts will have no data to work on.
## Possible checks
To check whether this is a misconfiguration (no targets configured or
discovered), check the `/targets` page on the Prometheus server to verify
that there are discovered targets.
To check whether there is another problem, log in to the machine and check
the Prometheus logs and general Prometheus health metrics.
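The per-instance ingestion rate that the alert watches can be graphed directly; this sketch reuses the alert's expression:

```
# Samples ingested per second; a flat zero confirms the alert condition.
rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m])
```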
## Resolution
Fix the targets misconfiguration or fix whatever other problem was indicated
in the logs.
# Prometheus Notifications Backlog
## Symptoms
Prometheus is having trouble working off its queue of notifications to send
to Alertmanager. Alert notifications may get delivered late or not at all.
## Possible checks
See how `prometheus_notifications_queue_length` developed
over time. Log in to the machine and check the Prometheus logs to see if
Prometheus is encountering any errors while sending alerts to Alertmanager.
Check that Alertmanager is reachable and not overloaded.
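A few expressions that can help narrow this down, sketched with the alert's `job` matcher; the capacity and error metrics are standard Prometheus self-metrics, but verify that they exist on your version:

```
# How full the notification queue is, in percent.
  prometheus_notifications_queue_length{job=~"prometheus.*"}
/
  prometheus_notifications_queue_capacity{job=~"prometheus.*"}
* 100

# Rate of errors while sending notifications to Alertmanager.
rate(prometheus_notifications_errors_total{job=~"prometheus.*"}[5m])
```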
## Resolution
Depending on the above checks, either address the errors that are logged
by Prometheus or ensure that Alertmanager is healthy again.
# Prometheus Out-of-Order Samples
## Symptoms
Prometheus is ingesting samples for the same series with duplicate timestamps,
but different values.
## Possible checks
Check whether any two targets got relabeled into the same labelset.
Are there any targets that explicitly set client-side timestamps incorrectly
in their `/metrics` output?
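To see which instances are discarding samples and at what rate, graph the metric the alert uses; this is a sketch with the alert's `job` matcher:

```
# Out-of-order samples discarded over the last 10 minutes, per instance.
increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
```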
## Resolution
Fix the erroneous relabeling rules or the targets that produce incorrectly
timestamped data.
# Prometheus Persist Errors
## Symptoms
Prometheus is encountering errors while persisting sample chunks.
## Possible checks
See how `rate(prometheus_local_storage_persist_errors_total[10m])` developed
over time. Log in to the machine and check the Prometheus logs to see the
exact error that is occurring. Most likely, the disk is full or there are
IO errors.
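The error rate can be broken down per instance with the alert's own expression. If node exporter metrics are available, disk usage can be checked as well; the filesystem metric names and mountpoint below are assumptions that depend on your node exporter version and setup.

```
# Persist errors per second, per Prometheus instance (alert expression).
rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m])

# Percentage of free space on the Prometheus storage filesystem
# (assumed node exporter metric names and mountpoint).
  node_filesystem_avail{mountpoint="/prometheus"}
/
  node_filesystem_size{mountpoint="/prometheus"}
* 100
```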
## Resolution
If the disk is full, either reduce the retention time, use a larger disk,
or put fewer time series on this Prometheus server.
If there are other errors in the log, address those.
# Prometheus Persistence Pressure Too High
## Symptoms
Prometheus is approaching critical persistence pressure, meaning that it cannot
keep up with persisting the number of ingested samples. Eventually, ingestion
will be throttled as a result of this.
## Possible checks
Did the number of time series increase recently?
Graph `prometheus_local_storage_memory_series` to see.
In general, any kind of load can contribute to persistence pressure,
but most of the time it is caused by handling too many time series
or scraping too frequently on one Prometheus server.
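The urgency score and its projected trend, the two conditions of the alert, can be graphed with this sketch:

```
# Current persistence urgency score (ingestion is throttled when it reaches 1).
prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}

# Projected score 24 hours from now, extrapolated from the last 30 minutes.
predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 24)
```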
## Resolution
Reduce the load on the Prometheus server, especially the number of handled
or changing time series.
# Prometheus Rule Evaluation Slow
## Symptoms
Rule-based metrics are appearing with a lag, or not at all, because
Prometheus's rule evaluator is taking a long time to complete each cycle.
## Possible checks
Check how the expression `prometheus_evaluator_duration_seconds{quantile="0.9",job=~"prometheus.*"}`
developed over time. Did it recently increase by a lot? Perhaps the rule
evaluation got slower due to more time series. Check for a recent increase
in time series: `prometheus_local_storage_memory_series`.
The Prometheus server might also be overloaded by other work, or there might
simply be too many expensive rules configured.
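Two expressions worth graphing side by side, sketched with the alert's `job` matcher:

```
# 90th percentile duration of a rule evaluation cycle, in seconds.
prometheus_evaluator_duration_seconds{quantile="0.9",job=~"prometheus.*"}

# Number of in-memory time series, to correlate with any slowdown.
prometheus_local_storage_memory_series{job=~"prometheus.*"}
```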
## Resolution
Reduce the load on the Prometheus server by either reducing the number of
handled time series, the number of rules, rates of queries, or other causes
of load.
# Prometheus Scraping Slowly
## Symptoms
Prometheus is scraping targets slowly. New samples will appear more slowly
than the configured scrape interval allows, or not at all.
## Possible checks
See how `prometheus_target_interval_length_seconds{quantile="0.9"}` developed
over time. Did `count(up)` increase recently to indicate a higher number of
targets? Did `prometheus_local_storage_memory_series` increase recently to
indicate an overall larger number of time series that are scraped? Are the
targets themselves responsive on their `/metrics` endpoint?
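These checks translate into queries roughly like the following sketch (the `job` matcher on the self-metrics is taken from the alert; adjust as needed):

```
# 90th percentile of actual scrape interval lengths, per target pool.
prometheus_target_interval_length_seconds{quantile="0.9",job=~"prometheus.*"}

# Number of targets over time.
count(up)

# Number of in-memory time series over time.
prometheus_local_storage_memory_series{job=~"prometheus.*"}
```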
## Resolution
Depending on the above, either lower the load on Prometheus by reducing the
number of targets or time series, or ensure that your targets are quickly
scrapable.
# Prometheus Series Maintenance Stalled
## Symptoms
Prometheus is maintaining (persisting, archiving, truncating, purging, ...) in-memory
time series so slowly that it will take too long to complete a full cycle.
This will lead to persistence falling behind.
## Possible checks
Did the number of time series increase recently?
Graph `prometheus_local_storage_memory_series` to see. Also check the
rate of maintained series: `rate(prometheus_local_storage_series_ops_total{type="maintenance_in_memory"}[5m])`.
In general, any kind of load can contribute to slow series maintenance,
but most of the time this is caused by trying to handle too many time series
in one Prometheus server.
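The estimated hours to complete a full maintenance cycle, which is what the alert computes, can be graphed directly; this sketch mirrors the alert expression:

```
# Hours needed to maintain every in-memory series once at the current rate.
  prometheus_local_storage_memory_series{job=~"prometheus.*"}
/ on(job,instance)
  rate(prometheus_local_storage_series_ops_total{type="maintenance_in_memory",job=~"prometheus.*"}[5m])
/ 3600
```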
## Resolution
Reduce the load on the Prometheus server, especially the number of handled
or changing time series.
# Prometheus Storage Inconsistent
## Symptoms
Prometheus has encountered an inconsistency in its storage while reading from
or writing to it. Some series may now be inaccessible or damaged.
## Possible checks
Log in to the Prometheus server and check the logs to see if there is any specific
error pointed out. Did the server crash recently?
## Resolution
Restart Prometheus gracefully to trigger a recovery run.