diff --git a/charts/meta-monitoring/src/rules/loki-rules.yaml b/charts/meta-monitoring/src/rules/loki-rules.yaml new file mode 100644 index 0000000..33196c1 --- /dev/null +++ b/charts/meta-monitoring/src/rules/loki-rules.yaml @@ -0,0 +1,53 @@ +groups: +- name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate \ No newline at end of file diff --git a/charts/meta-monitoring/src/rules/mimir-rules.yaml b/charts/meta-monitoring/src/rules/mimir-rules.yaml new file mode 100644 index 0000000..1344cee --- /dev/null +++ b/charts/meta-monitoring/src/rules/mimir-rules.yaml @@ -0,0 +1,571 @@ +groups: +- name: mimir_api_1 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_count:sum_rate +- name: mimir_api_2 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate +- name: mimir_api_3 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate +- name: mimir_querier_api + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by + (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) + by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate +- name: mimir_cache + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) + by (cluster, job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds:avg + - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_cache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_cache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_cache_request_duration_seconds:avg + - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method)) + record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_cache_request_duration_seconds:avg + - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method) + record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + method) + record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job, method) + record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate +- name: mimir_storage + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:50quantile + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds:avg + - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate +- name: mimir_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:50quantile + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) + by (cluster, job) + record: cluster_job:cortex_query_frontend_retries:avg + - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by + (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, + cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, + job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate +- name: mimir_ingester_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:50quantile + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) + by (cluster, job) + record: cluster_job:cortex_ingester_queried_series:avg + - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:50quantile + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) + by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples:avg + - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:50quantile + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / + sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars:avg + - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, + job) + record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate +- name: mimir_received_samples + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_samples:rate5m +- name: mimir_exemplars_in + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) + record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m +- name: mimir_received_exemplars + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m +- name: mimir_exemplars_ingested + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) + record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m +- name: mimir_exemplars_appended + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) + record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m +- name: mimir_scaling_rules + rules: + - expr: | + # Convenience rule to get the number of replicas for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + kube_deployment_spec_replicas, + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + or + sum by (cluster, namespace, deployment) ( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") + ) + record: cluster_namespace_deployment:actual_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + / 240000 + ) + labels: + deployment: distributor + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 240000 + ) + labels: + deployment: distributor + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + * 3 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by(cluster, namespace) ( + cortex_ingester_memory_series + )[24h:] + ) + / 1500000 + ) + labels: + deployment: ingester + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + * 3 * 0.59999999999999998 / 1500000 + ) + labels: + deployment: ingester + reason: active_series_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + (sum by (cluster, namespace) ( + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} + ) / 4) + / + avg by (cluster, namespace) ( + memcached_limit_bytes{job=~".+/memcached"} + ) + ) + labels: + deployment: memcached + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate + - expr: | + # Convenience rule to get the CPU request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_cpu_cores was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + # Jobs should be sized to their CPU usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + ) + labels: + reason: cpu_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + # Convenience rule to get the Memory utilization for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + container_memory_usage_bytes{image!=""}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_memory_usage_bytes:sum + - expr: | + # Convenience rule to get the Memory request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_memory_bytes was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + # Jobs should be sized to their Memory usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + ) + labels: + reason: memory_usage + record: cluster_namespace_deployment_reason:required_replicas:count +- name: mimir_alertmanager_rules + rules: + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_alerts) + record: cluster_job_pod:cortex_alertmanager_alerts:sum + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_silences) + record: cluster_job_pod:cortex_alertmanager_silences:sum + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m +- name: mimir_ingester_rules + rules: + - expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) + record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m \ No newline at end of file diff --git a/charts/meta-monitoring/src/rules/tempo-rules.yaml b/charts/meta-monitoring/src/rules/tempo-rules.yaml new file mode 100644 index 0000000..59f8f17 --- /dev/null +++ b/charts/meta-monitoring/src/rules/tempo-rules.yaml @@ -0,0 +1,15 @@ +groups: +- name: tempo_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile + - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds:avg + - expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate + - expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate \ No newline at end of file diff --git a/charts/meta-monitoring/templates/ruler/ruler.yaml b/charts/meta-monitoring/templates/ruler/ruler.yaml new file mode 100644 index 0000000..9fae6e4 --- /dev/null +++ b/charts/meta-monitoring/templates/ruler/ruler.yaml @@ -0,0 +1,126 @@ +{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: meta-mimir-ruler-for-dashboards + namespace: meta +spec: + progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app.kubernetes.io/component: ruler-for-dashboards + app.kubernetes.io/instance: meta + app.kubernetes.io/name: mimir + strategy: + rollingUpdate: + maxSurge: 50% + maxUnavailable: 0 + type: RollingUpdate + template: + metadata: + labels: + app.kubernetes.io/component: ruler-for-dashboards + app.kubernetes.io/instance: meta + app.kubernetes.io/name: mimir + namespace: meta + spec: + containers: + - args: + - -target=ruler + - -log.level=debug + - -ruler-storage.backend=local + - -ruler-storage.local.directory=/etc/rules + - -ruler.ring.prefix=dashboards/ + - -config.expand-env=true + - -config.file=/etc/mimir/mimir.yaml + image: grafana/mimir:2.8.0 + imagePullPolicy: IfNotPresent + name: ruler + ports: + - containerPort: 8080 + name: http-metrics + protocol: TCP + - containerPort: 9095 + name: grpc + protocol: TCP + - containerPort: 7946 + name: memberlist + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /ready + port: http-metrics + scheme: HTTP + initialDelaySeconds: 45 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: 100m + memory: 128Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etc/mimir + name: config + - mountPath: /var/mimir + name: runtime-config + - mountPath: /data + name: storage + - mountPath: /active-query-tracker + name: active-queries + - mountPath: /etc/rules/anonymous + name: rules + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + seccompProfile: + type: RuntimeDefault + serviceAccount: meta-mimir + serviceAccountName: meta-mimir + terminationGracePeriodSeconds: 180 + topologySpreadConstraints: + - labelSelector: + matchLabels: + app.kubernetes.io/component: ruler + app.kubernetes.io/instance: meta + app.kubernetes.io/name: mimir + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + defaultMode: 420 + items: + - key: mimir.yaml + path: mimir.yaml + name: meta-mimir-config + name: config + - configMap: + defaultMode: 420 + name: meta-mimir-runtime + name: runtime-config + - emptyDir: {} + name: storage + - emptyDir: {} + name: active-queries + - configMap: + defaultMode: 420 + name: rules + name: rules +{{- end }} diff --git a/charts/meta-monitoring/templates/ruler/rules-configmap.yaml b/charts/meta-monitoring/templates/ruler/rules-configmap.yaml new file mode 100644 index 0000000..c3396ef --- /dev/null +++ b/charts/meta-monitoring/templates/ruler/rules-configmap.yaml @@ -0,0 +1,18 @@ +{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rules + namespace: {{ $.Release.Namespace }} +data: +{{- if .Values.dashboards.logs.enabled }} +{{ ($.Files.Glob "src/rules/loki-rules.yaml").AsConfig | indent 2 }} +{{- end }} +{{- if .Values.dashboards.metrics.enabled }} +{{ ($.Files.Glob "src/rules/mimir-rules.yaml").AsConfig | indent 2 }} +{{- end }} +{{- if .Values.dashboards.traces.enabled }} +{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }} +{{- end }} +{{- end }}