Compare commits

..

2 Commits

Author SHA1 Message Date
Michel Hollands
c95c0e2ca9 Add node exporter
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-02 14:32:27 +01:00
Michel Hollands
c288a80bd4 Add metrics scraping of cadvisor and kubelet
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-02 12:05:55 +01:00
23 changed files with 136 additions and 16384 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,53 +0,0 @@
groups:
# Recording rules derived from the loki_request_duration_seconds histogram.
# For each aggregation level the group records the p99 and p50 quantiles, the
# average (rate of _sum divided by rate of _count), and the summed 1m rates of
# the _bucket, _sum and _count series.
- name: loki_rules
  rules:
  # Aggregated by (cluster, job).
  - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:loki_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:loki_request_duration_seconds:50quantile
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
      by (cluster, job)
    record: cluster_job:loki_request_duration_seconds:avg
  - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
    record: cluster_job:loki_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:loki_request_duration_seconds_count:sum_rate
  # Aggregated by (cluster, job, route).
  - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, route))
    record: cluster_job_route:loki_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, route))
    record: cluster_job_route:loki_request_duration_seconds:50quantile
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
      / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
    record: cluster_job_route:loki_request_duration_seconds:avg
  - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
      route)
    record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
    record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
    record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
  # Aggregated by (cluster, namespace, job, route).
  - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
      job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
      namespace, job, route)
    record: cluster_namespace_job_route:loki_request_duration_seconds:avg
  - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
      job, route)
    record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
      job, route)
    record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
      job, route)
    record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate

View File

@@ -1,571 +0,0 @@
groups:
# cortex_request_duration_seconds aggregated by (cluster, job): p99 and p50
# quantiles, average (rate of _sum / rate of _count), and the summed 1m rates
# of the _bucket, _sum and _count series.
- name: mimir_api_1
  rules:
  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
      by (cluster, job)
    record: cluster_job:cortex_request_duration_seconds:avg
  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:cortex_request_duration_seconds_count:sum_rate
# cortex_request_duration_seconds aggregated by (cluster, job, route): p99 and
# p50 quantiles, average, and the summed 1m rates of the _bucket, _sum and
# _count series.
- name: mimir_api_2
  rules:
  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, route))
    record: cluster_job_route:cortex_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, route))
    record: cluster_job_route:cortex_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
      / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
    record: cluster_job_route:cortex_request_duration_seconds:avg
  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
      route)
    record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
    record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
    record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
# cortex_request_duration_seconds aggregated by (cluster, namespace, job,
# route): p99 and p50 quantiles, average, and the summed 1m rates of the
# _bucket, _sum and _count series.
- name: mimir_api_3
  rules:
  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
      job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
      namespace, job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
      job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
      job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
      job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
# Recording rules derived from the cortex_querier_request_duration_seconds
# histogram. For each aggregation level the group records the p99 and p50
# quantiles, the average, and the summed 1m rates of the _bucket, _sum and
# _count series.
- name: mimir_querier_api
  rules:
  # Aggregated by (cluster, job).
  - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_querier_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_querier_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
      job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
      job)
    record: cluster_job:cortex_querier_request_duration_seconds:avg
  - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
      job)
    record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
      job)
    record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
      job)
    record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
  # Aggregated by (cluster, job, route).
  - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, route))
    record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, route))
    record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
      job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
      (cluster, job, route)
    record: cluster_job_route:cortex_querier_request_duration_seconds:avg
  - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
      job, route)
    record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
      job, route)
    record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
      job, route)
    record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
  # Aggregated by (cluster, namespace, job, route).
  - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
      namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
      by (cluster, namespace, job, route)
    record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
  - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
      namespace, job, route)
    record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
      namespace, job, route)
    record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
      namespace, job, route)
    record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
# Recording rules derived from two histograms:
# cortex_memcache_request_duration_seconds and
# cortex_cache_request_duration_seconds. Each set records the p99 and p50
# quantiles, the average, and the summed 1m rates of the _bucket, _sum and
# _count series.
- name: mimir_cache
  rules:
  # cortex_memcache_request_duration_seconds by (cluster, job, method).
  - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, method))
    record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, method))
    record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
      job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
      by (cluster, job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
  - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
      job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
      job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
      job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
  # cortex_cache_request_duration_seconds by (cluster, job).
  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_cache_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_cache_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
      / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:cortex_cache_request_duration_seconds:avg
  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
      job)
    record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
      job)
    record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
  # cortex_cache_request_duration_seconds by (cluster, job, method).
  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, method))
    record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
      by (le, cluster, job, method))
    record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
      method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
      job, method)
    record: cluster_job_method:cortex_cache_request_duration_seconds:avg
  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
      job, method)
    record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
      method)
    record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
      job, method)
    record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
# cortex_kv_request_duration_seconds aggregated by (cluster, job): p99 and p50
# quantiles, average, and the summed 1m rates of the _bucket, _sum and _count
# series.
- name: mimir_storage
  rules:
  - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_kv_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_kv_request_duration_seconds:50quantile
  - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
      / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:cortex_kv_request_duration_seconds:avg
  - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
      job)
    record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
# Recording rules derived from two histograms, both aggregated by
# (cluster, job): cortex_query_frontend_retries and
# cortex_query_frontend_queue_duration_seconds. Each set records the p99 and
# p50 quantiles, the average, and the summed 1m rates of the _bucket, _sum and
# _count series.
- name: mimir_queries
  rules:
  # cortex_query_frontend_retries.
  - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_query_frontend_retries:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_query_frontend_retries:50quantile
  - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
      by (cluster, job)
    record: cluster_job:cortex_query_frontend_retries:avg
  - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
    record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
  - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
  - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
    record: cluster_job:cortex_query_frontend_retries_count:sum_rate
  # cortex_query_frontend_queue_duration_seconds.
  - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
      job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
      (cluster, job)
    record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
      cluster, job)
    record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
      job)
    record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
      job)
    record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
# Recording rules derived from three histograms, all aggregated by
# (cluster, job): cortex_ingester_queried_series,
# cortex_ingester_queried_samples and cortex_ingester_queried_exemplars.
# Each set records the p99 and p50 quantiles, the average, and the summed 1m
# rates of the _bucket, _sum and _count series.
- name: mimir_ingester_queries
  rules:
  # cortex_ingester_queried_series.
  - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_ingester_queried_series:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_ingester_queried_series:50quantile
  - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
      by (cluster, job)
    record: cluster_job:cortex_ingester_queried_series:avg
  - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
    record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
  - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
  - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_series_count:sum_rate
  # cortex_ingester_queried_samples.
  - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_ingester_queried_samples:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_ingester_queried_samples:50quantile
  - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
      by (cluster, job)
    record: cluster_job:cortex_ingester_queried_samples:avg
  - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
    record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
  - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
  - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
  # cortex_ingester_queried_exemplars.
  - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_ingester_queried_exemplars:99quantile
  - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
      by (le, cluster, job))
    record: cluster_job:cortex_ingester_queried_exemplars:50quantile
  - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
      sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_exemplars:avg
  - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
      job)
    record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
  - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
  - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
    record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
# 5m rate of cortex_distributor_received_samples_total, summed by
# (cluster, namespace, job).
- name: mimir_received_samples
  rules:
  - expr: |
      sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
    record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
# 5m rate of cortex_distributor_exemplars_in_total, summed by
# (cluster, namespace, job).
- name: mimir_exemplars_in
  rules:
  - expr: |
      sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
    record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
# 5m rate of cortex_distributor_received_exemplars_total, summed by
# (cluster, namespace, job).
- name: mimir_received_exemplars
  rules:
  - expr: |
      sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
    record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
# 5m rate of cortex_ingester_ingested_exemplars_total, summed by
# (cluster, namespace, job).
- name: mimir_exemplars_ingested
  rules:
  - expr: |
      sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
    record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
# 5m rate of cortex_ingester_tsdb_exemplar_exemplars_appended_total, summed by
# (cluster, namespace, job).
- name: mimir_exemplars_appended
  rules:
  - expr: |
      sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
    record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
# Replica-sizing rules. Several rules write the same series name,
# cluster_namespace_deployment_reason:required_replicas:count, and are told
# apart by their static `deployment` / `reason` labels. The remaining rules
# record helper series (actual replicas, CPU/memory usage and requests) that
# the cpu_usage and memory_usage sizing rules consume.
# NOTE(review): the divisors used below (240000, 80000, 1500000) look like
# per-replica capacity targets - confirm against the upstream mixin config.
- name: mimir_scaling_rules
  rules:
  - expr: |
      # Convenience rule to get the number of replicas for both a deployment and a statefulset.
      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
      sum by (cluster, namespace, deployment) (
        label_replace(
          kube_deployment_spec_replicas,
          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
          # always matches everything and the (optional) zone is not removed.
          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
        )
      )
      or
      sum by (cluster, namespace, deployment) (
        label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
      )
    record: cluster_namespace_deployment:actual_replicas:count
  # Distributor: p99 over 24h of the received-samples rate, divided by 240000.
  - expr: |
      ceil(
        quantile_over_time(0.99,
          sum by (cluster, namespace) (
            cluster_namespace_job:cortex_distributor_received_samples:rate5m
          )[24h:]
        )
        / 240000
      )
    labels:
      deployment: distributor
      reason: sample_rate
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Distributor: sum of the ingestion_rate limit overrides, scaled and divided
  # by 240000.
  - expr: |
      ceil(
        sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
        * 0.59999999999999998 / 240000
      )
    labels:
      deployment: distributor
      reason: sample_rate_limits
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Ingester: p99 over 24h of the received-samples rate, times 3, divided by
  # 80000.
  - expr: |
      ceil(
        quantile_over_time(0.99,
          sum by (cluster, namespace) (
            cluster_namespace_job:cortex_distributor_received_samples:rate5m
          )[24h:]
        )
        * 3 / 80000
      )
    labels:
      deployment: ingester
      reason: sample_rate
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Ingester: p99 over 24h of cortex_ingester_memory_series, divided by 1500000.
  - expr: |
      ceil(
        quantile_over_time(0.99,
          sum by(cluster, namespace) (
            cortex_ingester_memory_series
          )[24h:]
        )
        / 1500000
      )
    labels:
      deployment: ingester
      reason: active_series
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Ingester: sum of the max_global_series_per_user limit overrides, scaled
  # and divided by 1500000.
  - expr: |
      ceil(
        sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
        * 3 * 0.59999999999999998 / 1500000
      )
    labels:
      deployment: ingester
      reason: active_series_limits
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Ingester: sum of the ingestion_rate limit overrides, scaled and divided by
  # 80000.
  - expr: |
      ceil(
        sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
        * 0.59999999999999998 / 80000
      )
    labels:
      deployment: ingester
      reason: sample_rate_limits
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Memcached: a quarter of the ingester TSDB block bytes divided by the
  # average memcached_limit_bytes.
  - expr: |
      ceil(
        (sum by (cluster, namespace) (
          cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
        ) / 4)
          /
        avg by (cluster, namespace) (
          memcached_limit_bytes{job=~".+/memcached"}
        )
      )
    labels:
      deployment: memcached
      reason: active_series
    record: cluster_namespace_deployment_reason:required_replicas:count
  # Helper: 1m CPU usage rate per deployment, with the deployment name derived
  # from the pod name via label_replace.
  - expr: |
      sum by (cluster, namespace, deployment) (
        label_replace(
          label_replace(
            sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
            "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
          ),
          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
          # always matches everything and the (optional) zone is not removed.
          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
        )
      )
    record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
  - expr: |
      # Convenience rule to get the CPU request for both a deployment and a statefulset.
      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
      # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
      # that remove resource metrics, ref:
      # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
      # - https://github.com/kubernetes/kube-state-metrics/pull/1004
      #
      # This is the old expression, compatible with kube-state-metrics < v2.0.0,
      # where kube_pod_container_resource_requests_cpu_cores was removed:
      (
        sum by (cluster, namespace, deployment) (
          label_replace(
            label_replace(
              kube_pod_container_resource_requests_cpu_cores,
              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
            ),
            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
            # always matches everything and the (optional) zone is not removed.
            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
          )
        )
      )
      or
      # This expression is compatible with kube-state-metrics >= v1.4.0,
      # where kube_pod_container_resource_requests was introduced.
      (
        sum by (cluster, namespace, deployment) (
          label_replace(
            label_replace(
              kube_pod_container_resource_requests{resource="cpu"},
              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
            ),
            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
            # always matches everything and the (optional) zone is not removed.
            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
          )
        )
      )
    record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
  - expr: |
      # Jobs should be sized to their CPU usage.
      # We do this by comparing 99th percentile usage over the last 24hrs to
      # their current provisioned #replicas and resource requests.
      ceil(
        cluster_namespace_deployment:actual_replicas:count
          *
        quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
          /
        cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
      )
    labels:
      reason: cpu_usage
    record: cluster_namespace_deployment_reason:required_replicas:count
  - expr: |
      # Convenience rule to get the Memory utilization for both a deployment and a statefulset.
      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
      sum by (cluster, namespace, deployment) (
        label_replace(
          label_replace(
            container_memory_usage_bytes{image!=""},
            "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
          ),
          # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
          # always matches everything and the (optional) zone is not removed.
          "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
        )
      )
    record: cluster_namespace_deployment:container_memory_usage_bytes:sum
  - expr: |
      # Convenience rule to get the Memory request for both a deployment and a statefulset.
      # Multi-zone deployments are grouped together removing the "zone-X" suffix.
      # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
      # that remove resource metrics, ref:
      # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
      # - https://github.com/kubernetes/kube-state-metrics/pull/1004
      #
      # This is the old expression, compatible with kube-state-metrics < v2.0.0,
      # where kube_pod_container_resource_requests_memory_bytes was removed:
      (
        sum by (cluster, namespace, deployment) (
          label_replace(
            label_replace(
              kube_pod_container_resource_requests_memory_bytes,
              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
            ),
            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
            # always matches everything and the (optional) zone is not removed.
            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
          )
        )
      )
      or
      # This expression is compatible with kube-state-metrics >= v1.4.0,
      # where kube_pod_container_resource_requests was introduced.
      (
        sum by (cluster, namespace, deployment) (
          label_replace(
            label_replace(
              kube_pod_container_resource_requests{resource="memory"},
              "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
            ),
            # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
            # always matches everything and the (optional) zone is not removed.
            "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
          )
        )
      )
    record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
  - expr: |
      # Jobs should be sized to their Memory usage.
      # We do this by comparing 99th percentile usage over the last 24hrs to
      # their current provisioned #replicas and resource requests.
      ceil(
        cluster_namespace_deployment:actual_replicas:count
          *
        quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
          /
        cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
      )
    labels:
      reason: memory_usage
    record: cluster_namespace_deployment_reason:required_replicas:count
# Alertmanager aggregates: the alerts and silences gauges are summed by
# (cluster, job, pod); the counter series are recorded as 5m rates summed by
# (cluster, job), or by (cluster, job, integration) for notifications.
- name: mimir_alertmanager_rules
  rules:
  - expr: |
      sum by (cluster, job, pod) (cortex_alertmanager_alerts)
    record: cluster_job_pod:cortex_alertmanager_alerts:sum
  - expr: |
      sum by (cluster, job, pod) (cortex_alertmanager_silences)
    record: cluster_job_pod:cortex_alertmanager_silences:sum
  - expr: |
      sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
    record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
  - expr: |
      sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
    record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
  - expr: |
      sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
    record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
  - expr: |
      sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
    record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
  - expr: |
      sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
    record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
  - expr: |
      sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
    record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
  - expr: |
      sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
    record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
  - expr: |
      sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
    record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
# 1m rate of cortex_ingester_ingested_samples_total, summed by
# (cluster, namespace, pod).
- name: mimir_ingester_rules
  rules:
  - expr: |
      sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
    record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m

View File

@@ -1,15 +0,0 @@
# Recording rules derived from the tempo_request_duration_seconds histogram.
# Every rule aggregates 1m rates by cluster, namespace, job and route.
groups:
  - name: tempo_rules
    rules:
      # Latency quantiles and mean.
      - record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile
        expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
      - record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile
        expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
      - record: cluster_namespace_job_route:tempo_request_duration_seconds:avg
        expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
      # Raw bucket / sum / count rates.
      - record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate
        expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
      - record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate
        expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
      - record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate
        expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)

View File

@@ -37,6 +37,8 @@ data:
}
}
// Logs
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
loki.source.kubernetes "pods" {
targets = discovery.relabel.rename_meta_labels.output
@@ -58,6 +60,8 @@ data:
{{- end }}
{{- end }}
// Metrics
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
prometheus.scrape "pods" {
targets = discovery.relabel.rename_meta_labels.output
@@ -70,8 +74,117 @@ data:
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
}
{{- end }}
// cAdvisor and Kubelet metrics
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
// Discovers one target per Kubernetes node (role "node").
discovery.kubernetes "all_nodes" {
  role = "node"
}
// Relabels the node targets produced by discovery.kubernetes.all_nodes
// before they are handed to the cadvisor and kubelet scrapes.
//
// Only node-level metadata is used here. Targets discovered with
// role = "node" carry no __meta_kubernetes_namespace or
// __meta_kubernetes_pod_* labels, so pod-oriented rules must not be applied:
// a three-label job rule would join empty values into "//", still match
// "(.*)/(.*)/(.*)", and set job to the literal "/-" on every target. With no
// job rule here, each prometheus.scrape component falls back to its own
// default job name.
discovery.relabel "all_nodes" {
  targets = discovery.kubernetes.all_nodes.targets

  // Expose the Kubernetes node name as the "node" label.
  rule {
    source_labels = ["__meta_kubernetes_node_name"]
    target_label  = "node"
  }

  // Stamp every target with the cluster name taken from the chart values.
  rule {
    target_label = "cluster"
    replacement  = "{{- .Values.clusterName -}}"
  }
}
// Scrapes the /metrics/cadvisor path of every relabelled node target over
// HTTPS, authenticating with the mounted service account token and verifying
// the server against the mounted service account CA bundle.
prometheus.scrape "cadvisor" {
  // What to scrape and how often.
  scheme          = "https"
  metrics_path    = "/metrics/cadvisor"
  scrape_interval = "15s"

  // Credentials mounted into the pod.
  bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
  tls_config {
    ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
  }

  // Input targets and output receivers.
  targets    = discovery.relabel.all_nodes.output
  forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
}
// Scrapes the /metrics path of every relabelled node target over HTTPS,
// authenticating with the mounted service account token and verifying the
// server against the mounted service account CA bundle.
prometheus.scrape "kubelet" {
  // What to scrape and how often.
  scheme          = "https"
  metrics_path    = "/metrics"
  scrape_interval = "15s"

  // Credentials mounted into the pod.
  bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
  tls_config {
    ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
  }

  // Input targets and output receivers.
  targets    = discovery.relabel.all_nodes.output
  forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
}
// Embedded unix (node) exporter with its default settings.
prometheus.exporter.unix { }

// Scrapes the embedded exporter every 15s under the job name "node-exporter"
// and passes the samples to prometheus.relabel.node_exporter.
prometheus.scrape "node_exporter" {
  job_name        = "node-exporter"
  scrape_interval = "15s"
  targets         = prometheus.exporter.unix.targets
  forward_to      = [prometheus.relabel.node_exporter.receiver]
}
// Relabels the samples scraped from the embedded unix exporter and forwards
// them to the configured remote-write targets.
//
// These rules run on scraped samples, not on discovery targets, so no
// __meta_kubernetes_* label is ever present. Rules reading those labels are
// deliberately absent:
//   * a three-label job rule would join empty values into "//", still match
//     "(.*)/(.*)/(.*)", and overwrite job = "node-exporter" with "/-";
//   * single-label copies from an empty source resolve to an empty value,
//     which deletes any existing "node", "namespace" or "pod" label.
prometheus.relabel "node_exporter" {
  forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]

  // Value of the HOSTNAME environment variable of the agent process.
  // NOTE(review): inside a pod this is normally the pod name, not the node
  // name - confirm this is the intended value for "nodename".
  rule {
    replacement  = env("HOSTNAME")
    target_label = "nodename"
  }

  // Fixed job label for all node exporter series.
  rule {
    replacement  = "node-exporter"
    target_label = "job"
  }

  // Stamp every series with the cluster name taken from the chart values.
  rule {
    target_label = "cluster"
    replacement  = "{{- .Values.clusterName -}}"
  }
}
{{- end }}
// Traces
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
otelcol.receiver.otlp "otlp_receiver" {

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.logs.enabled }}
{{- if .Values.local.logs.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.logs.enabled }}
{{- if .Values.local.logs.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,4 +1,4 @@
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap

View File

@@ -1,16 +1,16 @@
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: dashboards-provisioning
name: loki-dashboards-provisioning
namespace: {{ $.Release.Namespace }}
data:
dashboards.yaml: |
---
apiVersion: 1
providers:
{{- if .Values.dashboards.logs.enabled }}
{{- if .Values.local.logs.enabled }}
- disableDeletion: true
editable: false
folder: Loki
@@ -28,7 +28,7 @@ data:
orgId: 1
type: file
{{- end }}
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
- disableDeletion: true
editable: false
folder: Mimir
@@ -70,14 +70,4 @@ data:
orgId: 1
type: file
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
- disableDeletion: true
editable: false
folder: Tempo
name: tempo-1
options:
path: /var/lib/grafana/dashboards/tempo-1
orgId: 1
type: file
{{- end }}
{{- end }}

View File

@@ -65,17 +65,15 @@ spec:
name: grafana-pv
- mountPath: /etc/grafana/provisioning/datasources
name: datasources-provisioning
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
- mountPath: /etc/grafana/provisioning/dashboards
name: dashboards-provisioning
{{- end }}
{{- if .Values.dashboards.logs.enabled }}
name: loki-dashboards-provisioning
{{- if .Values.local.logs.enabled }}
- mountPath: /var/lib/grafana/dashboards/loki-1
name: loki-dashboards-1
- mountPath: /var/lib/grafana/dashboards/loki-2
name: loki-dashboards-2
{{- end }}
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
- mountPath: /var/lib/grafana/dashboards/mimir-1
name: mimir-dashboards-1
- mountPath: /var/lib/grafana/dashboards/mimir-2
@@ -87,10 +85,6 @@ spec:
- mountPath: /var/lib/grafana/dashboards/mimir-5
name: mimir-dashboards-5
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
- mountPath: /var/lib/grafana/dashboards/tempo-1
name: tempo-dashboards-1
{{- end }}
volumes:
- name: grafana-pv
persistentVolumeClaim:
@@ -98,10 +92,10 @@ spec:
- name: datasources-provisioning
configMap:
name: datasources-provisioning
- name: dashboards-provisioning
{{- if .Values.local.logs.enabled }}
- name: loki-dashboards-provisioning
configMap:
name: dashboards-provisioning
{{- if .Values.dashboards.logs.enabled }}
name: loki-dashboards-provisioning
- name: loki-dashboards-1
configMap:
name: loki-dashboards-1
@@ -109,7 +103,10 @@ spec:
configMap:
name: loki-dashboards-2
{{- end }}
{{- if .Values.dashboards.metrics.enabled }}
{{- if .Values.local.metrics.enabled }}
- name: mimir-dashboards-provisioning
configMap:
name: mimir-dashboards-provisioning
- name: mimir-dashboards-1
configMap:
name: mimir-dashboards-1
@@ -126,11 +123,6 @@ spec:
configMap:
name: mimir-dashboards-5
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
- name: tempo-dashboards-1
configMap:
name: tempo-dashboards-1
{{- end }}
---
apiVersion: v1
@@ -146,4 +138,4 @@ spec:
app: grafana
sessionAffinity: None
type: ClusterIP # Make this configurable
{{- end }}
{{- end }}

View File

@@ -1,21 +0,0 @@
{{- if .Values.dashboards.traces.enabled }}
---
# ConfigMap "tempo-dashboards-1": bundles the Tempo dashboard JSON files from
# src/dashboards, one data key per file. Rendered only when
# .Values.dashboards.traces.enabled is set. Each file is read with Files.Get
# and passed through fromJson | toJson before being embedded.
apiVersion: v1
kind: ConfigMap
metadata:
name: tempo-dashboards-1
namespace: {{ $.Release.Namespace }}
data:
"tempo-operational.json": |
{{ $.Files.Get "src/dashboards/tempo-operational.json" | fromJson | toJson }}
"tempo-reads.json": |
{{ $.Files.Get "src/dashboards/tempo-reads.json" | fromJson | toJson }}
"tempo-resources.json": |
{{ $.Files.Get "src/dashboards/tempo-resources.json" | fromJson | toJson }}
"tempo-rollout-progress.json": |
{{ $.Files.Get "src/dashboards/tempo-rollout-progress.json" | fromJson | toJson }}
"tempo-tenants.json": |
{{ $.Files.Get "src/dashboards/tempo-tenants.json" | fromJson | toJson }}
"tempo-writes.json": |
{{ $.Files.Get "src/dashboards/tempo-writes.json" | fromJson | toJson }}
{{- end }}

View File

@@ -1,126 +0,0 @@
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
---
# Dedicated Mimir ruler (-target=ruler) that evaluates the rule files mounted
# from the "rules" ConfigMap through local ruler storage (/etc/rules).
# Rendered when any of the logs/metrics/traces dashboards is enabled.
# NOTE(review): the "meta" release name is still baked into the Deployment,
# ConfigMap and ServiceAccount names below - confirm before installing the
# chart under a different release name.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meta-mimir-ruler-for-dashboards
  # Must be the namespace of the "rules" ConfigMap (created in the release
  # namespace); a ConfigMap volume cannot be mounted across namespaces.
  namespace: {{ $.Release.Namespace }}
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/component: ruler-for-dashboards
      app.kubernetes.io/instance: meta
      app.kubernetes.io/name: mimir
  strategy:
    rollingUpdate:
      maxSurge: 50%
      maxUnavailable: 0
    type: RollingUpdate
  template:
    metadata:
      labels:
        app.kubernetes.io/component: ruler-for-dashboards
        app.kubernetes.io/instance: meta
        app.kubernetes.io/name: mimir
      namespace: {{ $.Release.Namespace }}
    spec:
      containers:
        - args:
            - -target=ruler
            - -log.level=debug
            - -ruler-storage.backend=local
            - -ruler-storage.local.directory=/etc/rules
            # Separate ring prefix for this ruler instance.
            - -ruler.ring.prefix=dashboards/
            - -config.expand-env=true
            - -config.file=/etc/mimir/mimir.yaml
          image: grafana/mimir:2.8.0
          imagePullPolicy: IfNotPresent
          name: ruler
          ports:
            - containerPort: 8080
              name: http-metrics
              protocol: TCP
            - containerPort: 9095
              name: grpc
              protocol: TCP
            - containerPort: 7946
              name: memberlist
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /ready
              port: http-metrics
              scheme: HTTP
            initialDelaySeconds: 45
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /etc/mimir
              name: config
            - mountPath: /var/mimir
              name: runtime-config
            - mountPath: /data
              name: storage
            - mountPath: /active-query-tracker
              name: active-queries
            # Rule files are mounted in the "anonymous" sub-directory of the
            # local ruler storage directory.
            - mountPath: /etc/rules/anonymous
              name: rules
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext:
        fsGroup: 10001
        runAsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
        seccompProfile:
          type: RuntimeDefault
      serviceAccount: meta-mimir
      serviceAccountName: meta-mimir
      terminationGracePeriodSeconds: 180
      topologySpreadConstraints:
        # The selector matches this Deployment's own pod labels so the skew
        # is computed over these pods.
        - labelSelector:
            matchLabels:
              app.kubernetes.io/component: ruler-for-dashboards
              app.kubernetes.io/instance: meta
              app.kubernetes.io/name: mimir
          maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: ScheduleAnyway
      volumes:
        - configMap:
            defaultMode: 420
            items:
              - key: mimir.yaml
                path: mimir.yaml
            name: meta-mimir-config
          name: config
        - configMap:
            defaultMode: 420
            name: meta-mimir-runtime
          name: runtime-config
        - emptyDir: {}
          name: storage
        - emptyDir: {}
          name: active-queries
        - configMap:
            defaultMode: 420
            name: rules
          name: rules
{{- end }}

View File

@@ -1,18 +0,0 @@
{{- if or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled }}
---
# ConfigMap "rules": one data entry per enabled signal, each taken from the
# matching file under src/rules via Files.Glob(...).AsConfig.
apiVersion: v1
kind: ConfigMap
metadata:
  name: rules
  namespace: {{ $.Release.Namespace }}
data:
{{- if .Values.dashboards.logs.enabled }}
{{ ($.Files.Glob "src/rules/loki-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- if .Values.dashboards.metrics.enabled }}
{{ ($.Files.Glob "src/rules/mimir-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- end }}

View File

@@ -14,6 +14,7 @@ local:
minio:
enabled: false # This should be set to true if any of the previous is enabled
cloud:
logs:
enabled: true
@@ -40,15 +41,6 @@ logs:
# source: "" # Empty uses the log message
# replace: "*****"
# Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana
dashboards:
logs:
enabled: true
metrics:
enabled: true
traces:
enabled: true
global:
minio:
rootUser: "rootuser"