|
|
@ -1,322 +1,322 @@
|
|
|
|
groups:
|
|
|
|
groups:
|
|
|
|
- name: mimir_api_1
|
|
|
|
- name: "mimir_api_1"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job:cortex_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job:cortex_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
|
|
|
|
by (cluster, job)
|
|
|
|
by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_request_duration_seconds:avg
|
|
|
|
record: "cluster_job:cortex_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
|
|
|
|
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_api_2
|
|
|
|
- name: "mimir_api_2"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, route))
|
|
|
|
by (le, cluster, job, route))"
|
|
|
|
record: cluster_job_route:cortex_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, route))
|
|
|
|
by (le, cluster, job, route))"
|
|
|
|
record: cluster_job_route:cortex_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
|
|
|
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
|
|
|
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
|
|
|
|
record: cluster_job_route:cortex_request_duration_seconds:avg
|
|
|
|
record: "cluster_job_route:cortex_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
|
|
|
route)
|
|
|
|
route)"
|
|
|
|
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)"
|
|
|
|
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
|
|
|
|
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_api_3
|
|
|
|
- name: "mimir_api_3"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, namespace, job, route))
|
|
|
|
by (le, cluster, namespace, job, route))"
|
|
|
|
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, namespace, job, route))
|
|
|
|
by (le, cluster, namespace, job, route))"
|
|
|
|
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
|
|
|
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
namespace, job, route)
|
|
|
|
namespace, job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
|
|
|
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
|
|
|
job, route)
|
|
|
|
job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
|
|
|
job, route)
|
|
|
|
job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
|
|
|
|
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
|
|
|
|
job, route)
|
|
|
|
job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_querier_api
|
|
|
|
- name: "mimir_querier_api"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_querier_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_querier_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_querier_request_duration_seconds:avg
|
|
|
|
record: "cluster_job:cortex_querier_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, route))
|
|
|
|
by (le, cluster, job, route))"
|
|
|
|
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, route))
|
|
|
|
by (le, cluster, job, route))"
|
|
|
|
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
|
|
|
|
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
|
|
|
|
(cluster, job, route)
|
|
|
|
(cluster, job, route)"
|
|
|
|
record: cluster_job_route:cortex_querier_request_duration_seconds:avg
|
|
|
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
job, route)
|
|
|
|
job, route)"
|
|
|
|
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job, route)
|
|
|
|
job, route)"
|
|
|
|
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job, route)
|
|
|
|
job, route)"
|
|
|
|
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, namespace, job, route))
|
|
|
|
by (le, cluster, namespace, job, route))"
|
|
|
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, namespace, job, route))
|
|
|
|
by (le, cluster, namespace, job, route))"
|
|
|
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
|
|
|
|
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
|
|
|
|
by (cluster, namespace, job, route)
|
|
|
|
by (cluster, namespace, job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
|
|
|
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
namespace, job, route)
|
|
|
|
namespace, job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
namespace, job, route)
|
|
|
|
namespace, job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
namespace, job, route)
|
|
|
|
namespace, job, route)"
|
|
|
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_cache
|
|
|
|
- name: "mimir_cache"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, method))
|
|
|
|
by (le, cluster, job, method))"
|
|
|
|
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, method))
|
|
|
|
by (le, cluster, job, method))"
|
|
|
|
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
|
|
|
|
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
|
|
|
|
by (cluster, job, method)
|
|
|
|
by (cluster, job, method)"
|
|
|
|
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
|
|
|
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
job, method)
|
|
|
|
job, method)"
|
|
|
|
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job, method)
|
|
|
|
job, method)"
|
|
|
|
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job, method)
|
|
|
|
job, method)"
|
|
|
|
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
|
|
|
|
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_cache_request_duration_seconds:avg
|
|
|
|
record: "cluster_job:cortex_cache_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, method))
|
|
|
|
by (le, cluster, job, method))"
|
|
|
|
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job, method))
|
|
|
|
by (le, cluster, job, method))"
|
|
|
|
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
|
|
|
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job, method)
|
|
|
|
job, method)"
|
|
|
|
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
|
|
|
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
job, method)
|
|
|
|
job, method)"
|
|
|
|
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
|
|
|
method)
|
|
|
|
method)"
|
|
|
|
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job, method)
|
|
|
|
job, method)"
|
|
|
|
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_storage
|
|
|
|
- name: "mimir_storage"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
|
|
|
|
record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
|
|
|
|
record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
|
|
|
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_kv_request_duration_seconds:avg
|
|
|
|
record: "cluster_job:cortex_kv_request_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_queries
|
|
|
|
- name: "mimir_queries"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_query_frontend_retries:99quantile
|
|
|
|
record: "cluster_job:cortex_query_frontend_retries:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_query_frontend_retries:50quantile
|
|
|
|
record: "cluster_job:cortex_query_frontend_retries:50quantile"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
|
|
|
by (cluster, job)
|
|
|
|
by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_retries:avg
|
|
|
|
record: "cluster_job:cortex_query_frontend_retries:avg"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
|
|
|
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
|
|
|
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
|
|
|
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
|
|
|
(cluster, job)
|
|
|
|
(cluster, job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
|
|
|
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
|
|
|
cluster, job)
|
|
|
|
cluster, job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
|
|
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
|
|
|
|
- name: mimir_ingester_queries
|
|
|
|
- name: "mimir_ingester_queries"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_ingester_queried_series:99quantile
|
|
|
|
record: "cluster_job:cortex_ingester_queried_series:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_ingester_queried_series:50quantile
|
|
|
|
record: "cluster_job:cortex_ingester_queried_series:50quantile"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
|
|
|
by (cluster, job)
|
|
|
|
by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_series:avg
|
|
|
|
record: "cluster_job:cortex_ingester_queried_series:avg"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_ingester_queried_samples:99quantile
|
|
|
|
record: "cluster_job:cortex_ingester_queried_samples:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_ingester_queried_samples:50quantile
|
|
|
|
record: "cluster_job:cortex_ingester_queried_samples:50quantile"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
|
|
|
by (cluster, job)
|
|
|
|
by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_samples:avg
|
|
|
|
record: "cluster_job:cortex_ingester_queried_samples:avg"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
|
|
|
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_ingester_queried_exemplars:99quantile
|
|
|
|
record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
|
|
|
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
|
|
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
|
|
|
by (le, cluster, job))
|
|
|
|
by (le, cluster, job))"
|
|
|
|
record: cluster_job:cortex_ingester_queried_exemplars:50quantile
|
|
|
|
record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
|
|
|
|
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
|
|
|
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_exemplars:avg
|
|
|
|
record: "cluster_job:cortex_ingester_queried_exemplars:avg"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
|
|
|
|
job)
|
|
|
|
job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
|
|
|
|
- expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
|
|
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
|
|
|
|
record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
|
|
|
|
record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
|
|
|
|
- name: mimir_received_samples
|
|
|
|
- name: "mimir_received_samples"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
|
|
|
|
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
|
|
|
record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
|
|
|
|
- name: mimir_exemplars_in
|
|
|
|
- name: "mimir_exemplars_in"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
|
|
|
|
record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
|
|
|
|
record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
|
|
|
|
- name: mimir_received_exemplars
|
|
|
|
- name: "mimir_received_exemplars"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
|
|
|
|
record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
|
|
|
|
record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
|
|
|
|
- name: mimir_exemplars_ingested
|
|
|
|
- name: "mimir_exemplars_ingested"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
|
|
|
|
record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
|
|
|
|
record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
|
|
|
|
- name: mimir_exemplars_appended
|
|
|
|
- name: "mimir_exemplars_appended"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
|
|
|
|
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
|
|
|
|
record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
|
|
|
|
record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
|
|
|
|
- name: mimir_scaling_rules
|
|
|
|
- name: "mimir_scaling_rules"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
|
|
|
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
|
|
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
|
|
|
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
kube_deployment_spec_replicas,
|
|
|
|
kube_deployment_spec_replicas,
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
or
|
|
|
|
or
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
|
|
|
|
label_replace(kube_statefulset_replicas, \"deployment\", \"$1\", \"statefulset\", \"(.*?)(?:-zone-[a-z])?\")
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
record: cluster_namespace_deployment:actual_replicas:count
|
|
|
|
record: "cluster_namespace_deployment:actual_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
quantile_over_time(0.99,
|
|
|
|
quantile_over_time(0.99,
|
|
|
|
sum by (cluster, namespace) (
|
|
|
|
sum by (cluster, namespace) (
|
|
|
@ -324,21 +324,21 @@ groups:
|
|
|
|
)[24h:]
|
|
|
|
)[24h:]
|
|
|
|
)
|
|
|
|
)
|
|
|
|
/ 240000
|
|
|
|
/ 240000
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: distributor
|
|
|
|
deployment: "distributor"
|
|
|
|
reason: sample_rate
|
|
|
|
reason: "sample_rate"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
|
|
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
|
|
|
|
* 0.59999999999999998 / 240000
|
|
|
|
* 0.59999999999999998 / 240000
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: distributor
|
|
|
|
deployment: "distributor"
|
|
|
|
reason: sample_rate_limits
|
|
|
|
reason: "sample_rate_limits"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
quantile_over_time(0.99,
|
|
|
|
quantile_over_time(0.99,
|
|
|
|
sum by (cluster, namespace) (
|
|
|
|
sum by (cluster, namespace) (
|
|
|
@ -346,12 +346,12 @@ groups:
|
|
|
|
)[24h:]
|
|
|
|
)[24h:]
|
|
|
|
)
|
|
|
|
)
|
|
|
|
* 3 / 80000
|
|
|
|
* 3 / 80000
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: ingester
|
|
|
|
deployment: "ingester"
|
|
|
|
reason: sample_rate
|
|
|
|
reason: "sample_rate"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
quantile_over_time(0.99,
|
|
|
|
quantile_over_time(0.99,
|
|
|
|
sum by(cluster, namespace) (
|
|
|
|
sum by(cluster, namespace) (
|
|
|
@ -359,59 +359,59 @@ groups:
|
|
|
|
)[24h:]
|
|
|
|
)[24h:]
|
|
|
|
)
|
|
|
|
)
|
|
|
|
/ 1500000
|
|
|
|
/ 1500000
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: ingester
|
|
|
|
deployment: "ingester"
|
|
|
|
reason: active_series
|
|
|
|
reason: "active_series"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
|
|
|
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"max_global_series_per_user\"})
|
|
|
|
* 3 * 0.59999999999999998 / 1500000
|
|
|
|
* 3 * 0.59999999999999998 / 1500000
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: ingester
|
|
|
|
deployment: "ingester"
|
|
|
|
reason: active_series_limits
|
|
|
|
reason: "active_series_limits"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
|
|
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
|
|
|
|
* 0.59999999999999998 / 80000
|
|
|
|
* 0.59999999999999998 / 80000
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: ingester
|
|
|
|
deployment: "ingester"
|
|
|
|
reason: sample_rate_limits
|
|
|
|
reason: "sample_rate_limits"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
ceil(
|
|
|
|
ceil(
|
|
|
|
(sum by (cluster, namespace) (
|
|
|
|
(sum by (cluster, namespace) (
|
|
|
|
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
|
|
|
|
cortex_ingester_tsdb_storage_blocks_bytes{job=~\".+/ingester.*\"}
|
|
|
|
) / 4)
|
|
|
|
) / 4)
|
|
|
|
/
|
|
|
|
/
|
|
|
|
avg by (cluster, namespace) (
|
|
|
|
avg by (cluster, namespace) (
|
|
|
|
memcached_limit_bytes{job=~".+/memcached"}
|
|
|
|
memcached_limit_bytes{job=~\".+/memcached\"}
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
deployment: memcached
|
|
|
|
deployment: "memcached"
|
|
|
|
reason: active_series
|
|
|
|
reason: "active_series"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
|
|
|
|
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
|
|
|
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
|
|
|
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
|
|
|
),
|
|
|
|
),
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
|
|
|
|
record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
|
|
|
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
|
|
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
|
|
|
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
|
|
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
|
|
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
|
|
|
# that remove resource metrics, ref:
|
|
|
|
# that remove resource metrics, ref:
|
|
|
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
|
|
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
|
|
@ -424,11 +424,11 @@ groups:
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
kube_pod_container_resource_requests_cpu_cores,
|
|
|
|
kube_pod_container_resource_requests_cpu_cores,
|
|
|
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
|
|
|
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
|
|
|
),
|
|
|
|
),
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
@ -439,17 +439,17 @@ groups:
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
kube_pod_container_resource_requests{resource="cpu"},
|
|
|
|
kube_pod_container_resource_requests{resource=\"cpu\"},
|
|
|
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
|
|
|
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
|
|
|
),
|
|
|
|
),
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
|
|
|
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
# Jobs should be sized to their CPU usage.
|
|
|
|
# Jobs should be sized to their CPU usage.
|
|
|
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
|
|
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
|
|
|
# their current provisioned #replicas and resource requests.
|
|
|
|
# their current provisioned #replicas and resource requests.
|
|
|
@ -459,28 +459,28 @@ groups:
|
|
|
|
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
|
|
|
|
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
|
|
|
|
/
|
|
|
|
/
|
|
|
|
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
|
|
|
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
reason: cpu_usage
|
|
|
|
reason: "cpu_usage"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
|
|
|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
|
|
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
|
|
|
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
container_memory_usage_bytes{image!=""},
|
|
|
|
container_memory_usage_bytes{image!=\"\"},
|
|
|
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
|
|
|
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
|
|
|
),
|
|
|
|
),
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
record: cluster_namespace_deployment:container_memory_usage_bytes:sum
|
|
|
|
record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
|
|
|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
|
|
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
|
|
|
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
|
|
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
|
|
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
|
|
|
# that remove resource metrics, ref:
|
|
|
|
# that remove resource metrics, ref:
|
|
|
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
|
|
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
|
|
@ -493,11 +493,11 @@ groups:
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
kube_pod_container_resource_requests_memory_bytes,
|
|
|
|
kube_pod_container_resource_requests_memory_bytes,
|
|
|
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
|
|
|
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
|
|
|
),
|
|
|
|
),
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
@ -508,17 +508,17 @@ groups:
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
sum by (cluster, namespace, deployment) (
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
label_replace(
|
|
|
|
kube_pod_container_resource_requests{resource="memory"},
|
|
|
|
kube_pod_container_resource_requests{resource=\"memory\"},
|
|
|
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
|
|
|
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
|
|
|
),
|
|
|
|
),
|
|
|
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
|
|
|
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
# always matches everything and the (optional) zone is not removed.
|
|
|
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
|
|
|
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
|
|
|
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
# Jobs should be sized to their Memory usage.
|
|
|
|
# Jobs should be sized to their Memory usage.
|
|
|
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
|
|
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
|
|
|
# their current provisioned #replicas and resource requests.
|
|
|
|
# their current provisioned #replicas and resource requests.
|
|
|
@ -528,44 +528,44 @@ groups:
|
|
|
|
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
|
|
|
|
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
|
|
|
|
/
|
|
|
|
/
|
|
|
|
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
|
|
|
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
|
|
|
)
|
|
|
|
)"
|
|
|
|
labels:
|
|
|
|
labels:
|
|
|
|
reason: memory_usage
|
|
|
|
reason: "memory_usage"
|
|
|
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
|
|
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
|
|
|
- name: mimir_alertmanager_rules
|
|
|
|
- name: "mimir_alertmanager_rules"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job, pod) (cortex_alertmanager_alerts)
|
|
|
|
sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
|
|
|
|
record: cluster_job_pod:cortex_alertmanager_alerts:sum
|
|
|
|
record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job, pod) (cortex_alertmanager_silences)
|
|
|
|
sum by (cluster, job, pod) (cortex_alertmanager_silences)"
|
|
|
|
record: cluster_job_pod:cortex_alertmanager_silences:sum
|
|
|
|
record: "cluster_job_pod:cortex_alertmanager_silences:sum"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
|
|
|
|
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
|
|
|
|
record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
|
|
|
|
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
|
|
|
|
record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
|
|
|
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
|
|
|
|
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
|
|
|
|
record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
|
|
|
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
|
|
|
|
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
|
|
|
|
record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
|
|
|
|
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
|
|
|
|
record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
|
|
|
|
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
|
|
|
|
record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
|
|
|
|
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
|
|
|
|
record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
|
|
|
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
|
|
|
|
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
|
|
|
|
record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
|
|
|
|
- name: mimir_ingester_rules
|
|
|
|
- name: "mimir_ingester_rules"
|
|
|
|
rules:
|
|
|
|
rules:
|
|
|
|
- expr: |
|
|
|
|
- expr: "|
|
|
|
|
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
|
|
|
|
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))"
|
|
|
|
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
|
|
|
|
record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"
|
|
|
|