forked from RemoteSync/grafana-meta-monitoring-chart
Compare commits
8 Commits
add_tempo_
...
add_retent
Author | SHA1 | Date | |
---|---|---|---|
|
18d24c39f7 | ||
|
23d14110a0 | ||
|
6fb22ae671 | ||
|
d3878e1516 | ||
|
8ae136e0c4 | ||
|
ac3e4462f9 | ||
|
c95c0e2ca9 | ||
|
c288a80bd4 |
53
charts/meta-monitoring/src/rules/loki-rules.yaml
Normal file
53
charts/meta-monitoring/src/rules/loki-rules.yaml
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
groups:
|
||||||
|
# Recording rules that pre-aggregate the loki_request_duration_seconds
# histogram over 1m rate windows: p99, p50, mean latency, and the raw
# bucket/sum/count rates, at three label granularities.
- name: loki_rules
  rules:
    # --- aggregated by (cluster, job) ---
    - record: cluster_job:loki_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:loki_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:loki_request_duration_seconds:avg
      expr: >-
        sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
        /
        sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
    - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:loki_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
    - record: cluster_job:loki_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)

    # --- aggregated by (cluster, job, route) ---
    - record: cluster_job_route:loki_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
    - record: cluster_job_route:loki_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
    - record: cluster_job_route:loki_request_duration_seconds:avg
      expr: >-
        sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
        /
        sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
    - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
    - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
    - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)

    # --- aggregated by (cluster, namespace, job, route) ---
    - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
    - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
    - record: cluster_namespace_job_route:loki_request_duration_seconds:avg
      expr: >-
        sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
        /
        sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
    - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
    - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
    - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
571
charts/meta-monitoring/src/rules/mimir-rules.yaml
Normal file
571
charts/meta-monitoring/src/rules/mimir-rules.yaml
Normal file
@@ -0,0 +1,571 @@
|
|||||||
|
groups:
|
||||||
|
# cortex_request_duration_seconds pre-aggregated by (cluster, job):
# p99, p50, mean, and bucket/sum/count rates over 1m windows.
- name: mimir_api_1
  rules:
    - record: cluster_job:cortex_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
# cortex_request_duration_seconds pre-aggregated by (cluster, job, route):
# p99, p50, mean, and bucket/sum/count rates over 1m windows.
- name: mimir_api_2
  rules:
    - record: cluster_job_route:cortex_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
    - record: cluster_job_route:cortex_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
    - record: cluster_job_route:cortex_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
        /
        sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
    - record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
    - record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
    - record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
# cortex_request_duration_seconds pre-aggregated by
# (cluster, namespace, job, route): p99, p50, mean, and bucket/sum/count
# rates over 1m windows.
- name: mimir_api_3
  rules:
    - record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
    - record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
    - record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
        /
        sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
    - record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
    - record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
    - record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||||
|
# cortex_querier_request_duration_seconds pre-aggregated over 1m rate windows
# (p99, p50, mean, bucket/sum/count rates) at three label granularities.
- name: mimir_querier_api
  rules:
    # --- aggregated by (cluster, job) ---
    - record: cluster_job:cortex_querier_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_querier_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_querier_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job)

    # --- aggregated by (cluster, job, route) ---
    - record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
    - record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
    - record: cluster_job_route:cortex_querier_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route)
        /
        sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route)
    - record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
    - record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route)
    - record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route)

    # --- aggregated by (cluster, namespace, job, route) ---
    - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
    - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
    - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
        /
        sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
    - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
    - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
    - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||||
|
# Cache latency histograms pre-aggregated over 1m rate windows
# (p99, p50, mean, bucket/sum/count rates):
#   - cortex_memcache_request_duration_seconds by (cluster, job, method)
#   - cortex_cache_request_duration_seconds by (cluster, job) and
#     by (cluster, job, method)
- name: mimir_cache
  rules:
    # --- cortex_memcache_request_duration_seconds by (cluster, job, method) ---
    - record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
    - record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
    - record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
        /
        sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
    - record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
    - record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
    - record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)

    # --- cortex_cache_request_duration_seconds by (cluster, job) ---
    - record: cluster_job:cortex_cache_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_cache_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_cache_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)

    # --- cortex_cache_request_duration_seconds by (cluster, job, method) ---
    - record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
    - record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
    - record: cluster_job_method:cortex_cache_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
        /
        sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
    - record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
    - record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
    - record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||||
|
# cortex_kv_request_duration_seconds pre-aggregated by (cluster, job):
# p99, p50, mean, and bucket/sum/count rates over 1m windows.
- name: mimir_storage
  rules:
    - record: cluster_job:cortex_kv_request_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_kv_request_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_kv_request_duration_seconds:avg
      expr: >-
        sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
# Query-frontend histograms pre-aggregated by (cluster, job) over 1m rate
# windows (p99, p50, mean, bucket/sum/count rates):
#   - cortex_query_frontend_retries
#   - cortex_query_frontend_queue_duration_seconds
- name: mimir_queries
  rules:
    # --- cortex_query_frontend_retries ---
    - record: cluster_job:cortex_query_frontend_retries:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_query_frontend_retries:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_query_frontend_retries:avg
      expr: >-
        sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
      expr: >-
        sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
      expr: >-
        sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_query_frontend_retries_count:sum_rate
      expr: >-
        sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)

    # --- cortex_query_frontend_queue_duration_seconds ---
    - record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
      expr: >-
        sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
      expr: >-
        sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
      expr: >-
        sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
      expr: >-
        sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
# Ingester per-query histograms pre-aggregated by (cluster, job) over 1m rate
# windows (p99, p50, mean, bucket/sum/count rates):
#   - cortex_ingester_queried_series
#   - cortex_ingester_queried_samples
#   - cortex_ingester_queried_exemplars
- name: mimir_ingester_queries
  rules:
    # --- cortex_ingester_queried_series ---
    - record: cluster_job:cortex_ingester_queried_series:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_ingester_queried_series:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_ingester_queried_series:avg
      expr: >-
        sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_ingester_queried_series_count:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)

    # --- cortex_ingester_queried_samples ---
    - record: cluster_job:cortex_ingester_queried_samples:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_ingester_queried_samples:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_ingester_queried_samples:avg
      expr: >-
        sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)

    # --- cortex_ingester_queried_exemplars ---
    - record: cluster_job:cortex_ingester_queried_exemplars:99quantile
      expr: >-
        histogram_quantile(0.99,
        sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_ingester_queried_exemplars:50quantile
      expr: >-
        histogram_quantile(0.50,
        sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job))
    - record: cluster_job:cortex_ingester_queried_exemplars:avg
      expr: >-
        sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
        /
        sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
    - record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)
    - record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
    - record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
      expr: >-
        sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
||||||
|
# 5m rate of cortex_distributor_received_samples_total, summed per
# (cluster, namespace, job).
- name: mimir_received_samples
  rules:
    - record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
      expr: |
        sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
|
||||||
|
# 5m rate of cortex_distributor_exemplars_in_total, summed per
# (cluster, namespace, job).
- name: mimir_exemplars_in
  rules:
    - record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
      expr: |
        sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
|
||||||
|
# 5m rate of cortex_distributor_received_exemplars_total, summed per
# (cluster, namespace, job).
- name: mimir_received_exemplars
  rules:
    - record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
      expr: |
        sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
|
||||||
|
# 5m rate of cortex_ingester_ingested_exemplars_total, summed per
# (cluster, namespace, job).
- name: mimir_exemplars_ingested
  rules:
    - record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
      expr: |
        sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
|
||||||
|
# 5m rate of cortex_ingester_tsdb_exemplar_exemplars_appended_total, summed per
# (cluster, namespace, job).
- name: mimir_exemplars_appended
  rules:
    - record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
      expr: |
        sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
|
||||||
|
- name: mimir_scaling_rules
|
||||||
|
rules:
|
||||||
|
# Replica count per (cluster, namespace, deployment): the deployment-based
# series combined (PromQL `or`) with the statefulset-based series, with any
# "-zone-<letter>" suffix stripped from the name first.
- record: cluster_namespace_deployment:actual_replicas:count
  expr: |
    # Convenience rule to get the number of replicas for both a deployment and a statefulset.
    # Multi-zone deployments are grouped together removing the "zone-X" suffix.
    sum by (cluster, namespace, deployment) (
      label_replace(
        kube_deployment_spec_replicas,
        # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
        # always matches everything and the (optional) zone is not removed.
        "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
      )
    )
    or
    sum by (cluster, namespace, deployment) (
      label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
    )
|
||||||
|
# Distributor replicas required by sample rate: the 24h p99 of the
# per-(cluster, namespace) received-samples rate, divided by 240000 and
# rounded up.
# NOTE(review): 240000 is presumably the samples/s budget per distributor
# replica — confirm.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: distributor
    reason: sample_rate
  expr: |
    ceil(
      quantile_over_time(0.99,
        sum by (cluster, namespace) (
          cluster_namespace_job:cortex_distributor_received_samples:rate5m
        )[24h:]
      )
      / 240000
    )
|
||||||
|
# Distributor replicas required by configured limits: the summed
# "ingestion_rate" overrides per (cluster, namespace), scaled by 0.6, divided
# by 240000 and rounded up.
# The factor is written as 0.6; the previous literal 0.59999999999999998 was a
# float-serialization artifact and parses to the same float64 value.
# NOTE(review): 0.6 is presumably the fraction of the limit expected to be in
# use, and 240000 the samples/s budget per distributor replica — confirm.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: distributor
    reason: sample_rate_limits
  expr: |
    ceil(
      sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
      * 0.6 / 240000
    )
|
||||||
|
# Ingester replicas required by sample rate: the 24h p99 of the
# per-(cluster, namespace) received-samples rate, times 3, divided by 80000
# and rounded up.
# NOTE(review): "* 3" is presumably the replication factor and 80000 the
# samples/s budget per ingester replica — confirm.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: ingester
    reason: sample_rate
  expr: |
    ceil(
      quantile_over_time(0.99,
        sum by (cluster, namespace) (
          cluster_namespace_job:cortex_distributor_received_samples:rate5m
        )[24h:]
      )
      * 3 / 80000
    )
|
||||||
|
# Ingester replicas required by in-memory series: the 24h p99 of
# cortex_ingester_memory_series summed per (cluster, namespace), divided by
# 1500000 and rounded up.
# NOTE(review): 1500000 is presumably the series budget per ingester replica
# — confirm.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: ingester
    reason: active_series
  expr: |
    ceil(
      quantile_over_time(0.99,
        sum by(cluster, namespace) (
          cortex_ingester_memory_series
        )[24h:]
      )
      / 1500000
    )
|
||||||
|
# Ingester replicas required by configured series limits: the summed
# "max_global_series_per_user" overrides per (cluster, namespace), times 3,
# scaled by 0.6, divided by 1500000 and rounded up.
# The factor is written as 0.6; the previous literal 0.59999999999999998 was a
# float-serialization artifact and parses to the same float64 value.
# NOTE(review): "* 3" is presumably the replication factor, 0.6 the fraction
# of the limit expected to be in use, and 1500000 the series budget per
# ingester replica — confirm.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: ingester
    reason: active_series_limits
  expr: |
    ceil(
      sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
      * 3 * 0.6 / 1500000
    )
|
||||||
|
# Ingester replicas required by configured ingestion limits: the summed
# "ingestion_rate" overrides per (cluster, namespace), scaled by 0.6, divided
# by 80000 and rounded up.
# The factor is written as 0.6; the previous literal 0.59999999999999998 was a
# float-serialization artifact and parses to the same float64 value.
# NOTE(review): 0.6 is presumably the fraction of the limit expected to be in
# use, and 80000 the samples/s budget per ingester replica — confirm.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: ingester
    reason: sample_rate_limits
  expr: |
    ceil(
      sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
      * 0.6 / 80000
    )
|
||||||
|
# Memcached replicas: a quarter of the ingesters' TSDB block bytes per
# (cluster, namespace), divided by the average memcached_limit_bytes in that
# (cluster, namespace), rounded up.
# NOTE(review): the "/ 4" sizing ratio is not explained in this file — confirm
# its intent with the owners.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    deployment: memcached
    reason: active_series
  expr: |
    ceil(
      (sum by (cluster, namespace) (
        cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
      ) / 4)
        /
      avg by (cluster, namespace) (
        memcached_limit_bytes{job=~".+/memcached"}
      )
    )
|
||||||
|
# CPU usage rate (1m) per (cluster, namespace, deployment). The deployment
# label is derived from the pod name by stripping the trailing ordinal or
# "<hash>-<hash>" suffix, then any "-zone-<letter>" suffix.
- record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
  expr: |
    sum by (cluster, namespace, deployment) (
      label_replace(
        label_replace(
          sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
          "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
        ),
        # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
        # always matches everything and the (optional) zone is not removed.
        "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
      )
    )
|
||||||
|
# CPU request per (cluster, namespace, deployment). Two metric names are
# combined with PromQL `or` so the rule works across kube-state-metrics
# versions; the deployment label is derived from the pod name as in the
# sibling CPU/memory usage rules.
- record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
  expr: |
    # Convenience rule to get the CPU request for both a deployment and a statefulset.
    # Multi-zone deployments are grouped together removing the "zone-X" suffix.
    # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
    # that remove resource metrics, ref:
    # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
    # - https://github.com/kubernetes/kube-state-metrics/pull/1004
    #
    # This is the old expression, compatible with kube-state-metrics < v2.0.0
    # (kube_pod_container_resource_requests_cpu_cores was removed in v2.0.0):
    (
      sum by (cluster, namespace, deployment) (
        label_replace(
          label_replace(
            kube_pod_container_resource_requests_cpu_cores,
            "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
          ),
          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
          # always matches everything and the (optional) zone is not removed.
          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
        )
      )
    )
    or
    # This expression is compatible with kube-state-metrics >= v1.4.0,
    # where kube_pod_container_resource_requests was introduced.
    (
      sum by (cluster, namespace, deployment) (
        label_replace(
          label_replace(
            kube_pod_container_resource_requests{resource="cpu"},
            "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
          ),
          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
          # always matches everything and the (optional) zone is not removed.
          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
        )
      )
    )
|
||||||
|
# Replicas required by CPU usage: current replicas times the 24h p99 CPU
# usage rate, divided by the summed CPU requests, rounded up. No `deployment`
# label is set here; only `reason` is added.
- record: cluster_namespace_deployment_reason:required_replicas:count
  labels:
    reason: cpu_usage
  expr: |
    # Jobs should be sized to their CPU usage.
    # We do this by comparing 99th percentile usage over the last 24hrs to
    # their current provisioned #replicas and resource requests.
    ceil(
      cluster_namespace_deployment:actual_replicas:count
        *
      quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
        /
      cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
    )
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
container_memory_usage_bytes{image!=""},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:container_memory_usage_bytes:sum
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||||
|
# that remove resource metrics, ref:
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
|
||||||
|
#
|
||||||
|
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
|
||||||
|
# where kube_pod_container_resource_requests_memory_bytes was removed:
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests_memory_bytes,
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
# This expression is compatible with kube-state-metrics >= v1.4.0,
|
||||||
|
# where kube_pod_container_resource_requests was introduced.
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests{resource="memory"},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
|
- expr: |
|
||||||
|
# Jobs should be sized to their Memory usage.
|
||||||
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
|
# their current provisioned #replicas and resource requests.
|
||||||
|
ceil(
|
||||||
|
cluster_namespace_deployment:actual_replicas:count
|
||||||
|
*
|
||||||
|
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
|
||||||
|
/
|
||||||
|
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
reason: memory_usage
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- name: mimir_alertmanager_rules
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, pod) (cortex_alertmanager_alerts)
|
||||||
|
record: cluster_job_pod:cortex_alertmanager_alerts:sum
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, pod) (cortex_alertmanager_silences)
|
||||||
|
record: cluster_job_pod:cortex_alertmanager_silences:sum
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
|
||||||
|
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
|
||||||
|
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
|
||||||
|
- name: mimir_ingester_rules
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
|
||||||
|
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
|
15
charts/meta-monitoring/src/rules/tempo-rules.yaml
Normal file
15
charts/meta-monitoring/src/rules/tempo-rules.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
groups:
|
||||||
|
- name: tempo_rules
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate
|
@@ -37,6 +37,8 @@ data:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Logs
|
||||||
|
|
||||||
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
||||||
loki.source.kubernetes "pods" {
|
loki.source.kubernetes "pods" {
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
targets = discovery.relabel.rename_meta_labels.output
|
||||||
@@ -58,6 +60,8 @@ data:
|
|||||||
{{- end }}
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
// Metrics
|
||||||
|
|
||||||
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
||||||
prometheus.scrape "pods" {
|
prometheus.scrape "pods" {
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
targets = discovery.relabel.rename_meta_labels.output
|
||||||
@@ -70,8 +74,117 @@ data:
|
|||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
// cAdvisor and Kubelet metrics
|
||||||
|
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
|
||||||
|
discovery.kubernetes "all_nodes" {
|
||||||
|
role = "node"
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.relabel "all_nodes" {
|
||||||
|
targets = discovery.kubernetes.all_nodes.targets
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_node_name"]
|
||||||
|
target_label = "node"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterName -}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "cadvisor" {
|
||||||
|
targets = discovery.relabel.all_nodes.output
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
|
||||||
|
scrape_interval = "15s"
|
||||||
|
metrics_path = "/metrics/cadvisor"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||||
|
tls_config {
|
||||||
|
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "kubelet" {
|
||||||
|
targets = discovery.relabel.all_nodes.output
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
|
||||||
|
scrape_interval = "15s"
|
||||||
|
metrics_path = "/metrics"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||||
|
tls_config {
|
||||||
|
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.exporter.unix {}
|
||||||
|
|
||||||
|
prometheus.scrape "node_exporter" {
|
||||||
|
targets = prometheus.exporter.unix.targets
|
||||||
|
forward_to = [prometheus.relabel.node_exporter.receiver]
|
||||||
|
|
||||||
|
job_name = "node-exporter"
|
||||||
|
scrape_interval = "15s"
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.relabel "node_exporter" {
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
|
||||||
|
rule {
|
||||||
|
replacement = env("HOSTNAME")
|
||||||
|
target_label = "nodename"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
replacement = "node-exporter"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_node_name"]
|
||||||
|
target_label = "node"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterName -}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
// Traces
|
||||||
|
|
||||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
||||||
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
||||||
otelcol.receiver.otlp "otlp_receiver" {
|
otelcol.receiver.otlp "otlp_receiver" {
|
||||||
|
126
charts/meta-monitoring/templates/ruler/ruler.yaml
Normal file
126
charts/meta-monitoring/templates/ruler/ruler.yaml
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: meta-mimir-ruler-for-dashboards
|
||||||
|
# Deploy into the release namespace so the "rules" ConfigMap mounted below resolves.
namespace: {{ $.Release.Namespace }}
|
||||||
|
spec:
|
||||||
|
progressDeadlineSeconds: 600
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 10
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
strategy:
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 50%
|
||||||
|
maxUnavailable: 0
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
namespace: meta
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- args:
|
||||||
|
- -target=ruler
|
||||||
|
- -log.level=debug
|
||||||
|
- -ruler-storage.backend=local
|
||||||
|
- -ruler-storage.local.directory=/etc/rules
|
||||||
|
- -ruler.ring.prefix=dashboards/
|
||||||
|
- -config.expand-env=true
|
||||||
|
- -config.file=/etc/mimir/mimir.yaml
|
||||||
|
image: grafana/mimir:2.8.0
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
name: ruler
|
||||||
|
ports:
|
||||||
|
- containerPort: 8080
|
||||||
|
name: http-metrics
|
||||||
|
protocol: TCP
|
||||||
|
- containerPort: 9095
|
||||||
|
name: grpc
|
||||||
|
protocol: TCP
|
||||||
|
- containerPort: 7946
|
||||||
|
name: memberlist
|
||||||
|
protocol: TCP
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 3
|
||||||
|
httpGet:
|
||||||
|
path: /ready
|
||||||
|
port: http-metrics
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 45
|
||||||
|
periodSeconds: 10
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 1
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 128Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
terminationMessagePath: /dev/termination-log
|
||||||
|
terminationMessagePolicy: File
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /etc/mimir
|
||||||
|
name: config
|
||||||
|
- mountPath: /var/mimir
|
||||||
|
name: runtime-config
|
||||||
|
- mountPath: /data
|
||||||
|
name: storage
|
||||||
|
- mountPath: /active-query-tracker
|
||||||
|
name: active-queries
|
||||||
|
- mountPath: /etc/rules/anonymous
|
||||||
|
name: rules
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
restartPolicy: Always
|
||||||
|
schedulerName: default-scheduler
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 10001
|
||||||
|
runAsGroup: 10001
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 10001
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
serviceAccount: meta-mimir
|
||||||
|
serviceAccountName: meta-mimir
|
||||||
|
terminationGracePeriodSeconds: 180
|
||||||
|
topologySpreadConstraints:
|
||||||
|
- labelSelector:
|
||||||
|
matchLabels:
|
||||||
|
# Must equal this Deployment's pod label so skew is computed over its own pods.
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
maxSkew: 1
|
||||||
|
topologyKey: kubernetes.io/hostname
|
||||||
|
whenUnsatisfiable: ScheduleAnyway
|
||||||
|
volumes:
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
items:
|
||||||
|
- key: mimir.yaml
|
||||||
|
path: mimir.yaml
|
||||||
|
name: meta-mimir-config
|
||||||
|
name: config
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: meta-mimir-runtime
|
||||||
|
name: runtime-config
|
||||||
|
- emptyDir: {}
|
||||||
|
name: storage
|
||||||
|
- emptyDir: {}
|
||||||
|
name: active-queries
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: rules
|
||||||
|
name: rules
|
||||||
|
{{- end }}
|
18
charts/meta-monitoring/templates/ruler/rules-configmap.yaml
Normal file
18
charts/meta-monitoring/templates/ruler/rules-configmap.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: rules
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
data:
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/loki-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/mimir-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.traces.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
@@ -71,13 +71,22 @@ loki:
|
|||||||
storage:
|
storage:
|
||||||
type: "s3"
|
type: "s3"
|
||||||
s3:
|
s3:
|
||||||
endpoint: "meta-minio.meta.svc:9000"
|
|
||||||
access_key_id: rootuser
|
|
||||||
secret_access_key: rootpassword
|
|
||||||
insecure: true
|
insecure: true
|
||||||
|
s3ForcePathStyle: true
|
||||||
bucketNames:
|
bucketNames:
|
||||||
chunks: loki-chunks
|
chunks: loki-chunks
|
||||||
ruler: loki-ruler
|
ruler: loki-ruler
|
||||||
|
structuredConfig:
|
||||||
|
common:
|
||||||
|
storage:
|
||||||
|
s3:
|
||||||
|
access_key_id: "{{ .Values.global.minio.rootUser }}"
|
||||||
|
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
||||||
|
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
||||||
|
compactor:
|
||||||
|
retention_enabled: true
|
||||||
|
limits_config:
|
||||||
|
retention_period: 24h
|
||||||
monitoring:
|
monitoring:
|
||||||
dashboards:
|
dashboards:
|
||||||
enabled: false
|
enabled: false
|
||||||
@@ -128,6 +137,8 @@ mimir-distributed:
|
|||||||
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
||||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
||||||
insecure: true
|
insecure: true
|
||||||
|
limits:
|
||||||
|
compactor_blocks_retention_period: 24h
|
||||||
|
|
||||||
tempo-distributed:
|
tempo-distributed:
|
||||||
tempo:
|
tempo:
|
||||||
@@ -141,6 +152,9 @@ tempo-distributed:
|
|||||||
access_key: "{{ .Values.global.minio.rootUser }}"
|
access_key: "{{ .Values.global.minio.rootUser }}"
|
||||||
secret_key: "{{ .Values.global.minio.rootPassword }}"
|
secret_key: "{{ .Values.global.minio.rootPassword }}"
|
||||||
insecure: true
|
insecure: true
|
||||||
|
compactor:
|
||||||
|
compaction:
|
||||||
|
block_retention: 24h
|
||||||
traces:
|
traces:
|
||||||
otlp:
|
otlp:
|
||||||
http:
|
http:
|
||||||
|
Reference in New Issue
Block a user