Compare commits

...

16 Commits
main ... add_ci

Author SHA1 Message Date
Michel Hollands
c9295f51e2 Rename agent dashboard file
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-17 09:40:25 +01:00
Michel Hollands
1eafbd4904 Fix linting issues
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 17:19:09 +01:00
Michel Hollands
70312c135d Do local install
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 17:14:44 +01:00
Michel Hollands
1b3b89df42 Increase timeout
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 17:13:15 +01:00
Michel Hollands
419cdede3d Remove hardcoded meta namespace
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:59:42 +01:00
Michel Hollands
a3d27c1c3a Add default values in values.yaml
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:48:41 +01:00
Michel Hollands
4cf4d13955 Remove prometheus operator
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:43:37 +01:00
Michel Hollands
3a59a6bc82 Fix linting issues again 2
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:31:48 +01:00
Michel Hollands
e9b05c3856 Fix linting issues again
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:29:38 +01:00
Michel Hollands
f9b6ae9b3e Fix linting issues
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:15:00 +01:00
Michel Hollands
4534f3eb21 Update chart dependencies
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 16:09:22 +01:00
Michel Hollands
b7dc7212e9 Update path for CI
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 14:38:18 +01:00
Michel Hollands
ed6c6da4a0 Add ct.yaml file
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 14:31:55 +01:00
Michel Hollands
9c67417c02 Fix path again
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 14:11:09 +01:00
Michel Hollands
9e136cdc5e Fix paths
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 14:08:11 +01:00
Michel Hollands
46dd6ffeb3 Add ci steps
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 14:06:07 +01:00
13 changed files with 620 additions and 487 deletions

101
.github/workflows/helm-ci.yml vendored Normal file
View File

@ -0,0 +1,101 @@
---
# CI for the meta-monitoring Helm chart. On pull requests touching the chart it
# runs a YAML lint job and a chart-testing (ct) lint + install job.
name: helm-ci
on:
  pull_request:
    paths:
      - "charts/meta-monitoring/**"
env:
  # Passed to every `ct` invocation below through `--config`.
  CT_CONFIGFILE: charts/meta-monitoring/ct.yaml
jobs:
  call-lint:
    name: Lint Helm Chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      # - name: Check Docs
      #   run: |
      #     docker run --rm --volume "$(pwd):/helm-docs" -u "$(id -u)" jnorwood/helm-docs:v1.11.0
      #     if ! git diff --exit-code; then
      #       echo "Documentation not up to date. Please run helm-docs and commit changes!" >&2
      #       exit 1
      #     fi
      # Delegates to the repository Makefile's `helm-lint` target.
      - name: Lint Yaml
        run: make helm-lint
      # - name: Lint Code Base
      #   uses: docker://github/super-linter:v3.12.0
      #   env:
      #     FILTER_REGEX_EXCLUDE: .*(README\.md|Chart\.yaml|NOTES.txt).*
      #     FILTER_REGEX_INCLUDE: .*charts/meta-monitoring/.*
      #     VALIDATE_ALL_CODEBASE: false
      #     VALIDATE_KUBERNETES_KUBEVAL: false
      #     VALIDATE_YAML: false
      #     VALIDATE_GO: false
      #     DEFAULT_BRANCH: main
      #     GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  call-test:
    name: Test Helm Chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          # Full history, not a shallow clone.
          # NOTE(review): presumably needed so ct can diff against the target
          # branch - confirm.
          fetch-depth: 0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.8.2
      # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
      # yamllint (https://github.com/adrienverge/yamllint) which require Python
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string, never as a float.
          # NOTE(review): confirm 3.7 is still installable on the runner image.
          python-version: "3.7"
      - name: Set up chart-testing
        uses: helm/chart-testing-action@v2.4.0
      # Publishes the step output `changed=true` when ct reports at least one
      # changed chart; later steps key off that output.
      - name: Run chart-testing (list-changed)
        id: list-changed
        run: |
          changed=$(ct list-changed --config "${CT_CONFIGFILE}")
          if [[ -n "$changed" ]]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi
      - name: Run chart-testing (lint)
        run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
      - name: Create kind cluster
        uses: helm/kind-action@v1.8.0
        if: steps.list-changed.outputs.changed == 'true'
      # - name: Install prometheus operator
      #   id: install-prometheus
      #   if: steps.list-changed.outputs.changed == 'true'
      #   run: |
      #     kubectl create namespace prometheus
      #     helm install prometheus prometheus-community/kube-prometheus-stack \
      #       --namespace prometheus \
      #       --set grafana.enabled=false \
      #       --set prometheus.prometheusSpec.serviceMonitorSelector.matchLabels.release=prometheus
      #     kubectl --namespace prometheus get pods -l "release=prometheus"
      #     kubectl --namespace prometheus get services -l "release=prometheus"
      # Guarded with the same condition as "Create kind cluster" so the install
      # is never attempted in a run where no cluster was created.
      - name: Run chart-testing (install)
        if: steps.list-changed.outputs.changed == 'true'
        run: |
          changed=$(ct list-changed --config "${CT_CONFIGFILE}")
          # NOTE(review): this workflow only triggers on charts/meta-monitoring;
          # the enterprise-metrics skip looks like carry-over - confirm and drop.
          if [[ "$changed" == "charts/enterprise-metrics" ]]; then
            # Do not run `ct install` for enterprise-metrics
            exit 0
          fi
          ct install --config "${CT_CONFIGFILE}"

10
Makefile Normal file
View File

@ -0,0 +1,10 @@
# Adapted from https://www.thapaliya.com/en/writings/well-documented-makefiles/
.PHONY: help helm-lint

# The awk script prints every rule line in $(MAKEFILE_LIST) that carries a
# double-hash description after the target name, as a "target  description" row.
help: ## Display this help and any documented user-facing targets. Other undocumented targets may be present in the Makefile.
	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make <target>\n\nTargets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " %-45s %s\n", $$1, $$2 }' $(MAKEFILE_LIST)

# Runs the `lint` target of the Makefile in charts/meta-monitoring (-C),
# unconditionally remaking its targets (-B).
helm-lint: ## run helm linter
	$(MAKE) -BC charts/meta-monitoring lint

View File

@ -14,5 +14,5 @@ dependencies:
- name: minio - name: minio
repository: https://charts.min.io repository: https://charts.min.io
version: 5.0.11 version: 5.0.11
digest: sha256:4b04084e6fe821c4d481017b2430f7c8cd782a5d60830dd3a24eb8f10a9ece09 digest: sha256:da0e744b5046eb7972e0bf82d1d0ba4786e9600af63b65f35b16118105248074
generated: "2023-06-29T14:25:07.247853+01:00" generated: "2023-08-16T16:08:36.406791+01:00"

View File

@ -0,0 +1,7 @@
# Lint entry points for the chart sources under src/.
.DEFAULT_GOAL := all
.PHONY: all lint lint-yaml

# `all` is the default goal named above; it must exist, otherwise a bare `make`
# in this directory stops with "No rule to make target 'all'".
all: lint

# Aggregate lint target; currently only the YAML lint.
lint: lint-yaml

# Runs yamllint over src/ using the configuration file kept in that directory.
lint-yaml:
	yamllint -c $(CURDIR)/src/.yamllint.yaml $(CURDIR)/src

View File

@ -0,0 +1,11 @@
---
# chart-testing (ct) configuration, passed to ct via `--config`.

# Git remote and branch that ct compares against.
remote: "origin"
target-branch: "main"

# Directories that contain charts.
chart-dirs:
  - "charts"

# Extra Helm repositories, in name=url form.
chart-repos:
  - "grafana=https://grafana.github.io/helm-charts"
  - "minio=https://charts.min.io"

# Additional arguments handed to Helm; raises the timeout to 1200 seconds.
helm-extra-args: "--timeout 1200s"

# Checks that are switched off.
check-version-increment: false
validate-maintainers: false

View File

@ -0,0 +1,4 @@
---
# yamllint configuration: quoted-strings is the only rule configured here.
rules:
  quoted-strings:
    # The rule's default quote style, spelled out explicitly.
    quote-type: any
    # Report string values that are not quoted.
    required: true

View File

@ -1,53 +1,53 @@
groups: groups:
- name: loki_rules - name: "loki_rules"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:loki_request_duration_seconds:99quantile record: "cluster_job:loki_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:loki_request_duration_seconds:50quantile record: "cluster_job:loki_request_duration_seconds:50quantile"
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (cluster, job) by (cluster, job)"
record: cluster_job:loki_request_duration_seconds:avg record: "cluster_job:loki_request_duration_seconds:avg"
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)"
record: cluster_job:loki_request_duration_seconds_sum:sum_rate record: "cluster_job:loki_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)"
record: cluster_job:loki_request_duration_seconds_count:sum_rate record: "cluster_job:loki_request_duration_seconds_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route)) by (le, cluster, job, route))"
record: cluster_job_route:loki_request_duration_seconds:99quantile record: "cluster_job_route:loki_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route)) by (le, cluster, job, route))"
record: cluster_job_route:loki_request_duration_seconds:50quantile record: "cluster_job_route:loki_request_duration_seconds:50quantile"
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: cluster_job_route:loki_request_duration_seconds:avg record: "cluster_job_route:loki_request_duration_seconds:avg"
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route) route)"
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)"
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route)) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route)) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile"
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route) namespace, job, route)"
record: cluster_namespace_job_route:loki_request_duration_seconds:avg record: "cluster_namespace_job_route:loki_request_duration_seconds:avg"
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route) job, route)"
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) job, route)"
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route) job, route)"
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate"

View File

@ -1,322 +1,322 @@
groups: groups:
- name: mimir_api_1 - name: "mimir_api_1"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_request_duration_seconds:99quantile record: "cluster_job:cortex_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_request_duration_seconds:50quantile record: "cluster_job:cortex_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
by (cluster, job) by (cluster, job)"
record: cluster_job:cortex_request_duration_seconds:avg record: "cluster_job:cortex_request_duration_seconds:avg"
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)"
record: cluster_job:cortex_request_duration_seconds_count:sum_rate record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
- name: mimir_api_2 - name: "mimir_api_2"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route)) by (le, cluster, job, route))"
record: cluster_job_route:cortex_request_duration_seconds:99quantile record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route)) by (le, cluster, job, route))"
record: cluster_job_route:cortex_request_duration_seconds:50quantile record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: cluster_job_route:cortex_request_duration_seconds:avg record: "cluster_job_route:cortex_request_duration_seconds:avg"
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route) route)"
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)"
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
- name: mimir_api_3 - name: "mimir_api_3"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route)) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route)) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route) namespace, job, route)"
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route) job, route)"
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) job, route)"
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route) job, route)"
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
- name: mimir_querier_api - name: "mimir_querier_api"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_querier_request_duration_seconds:99quantile record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_querier_request_duration_seconds:50quantile record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job) job)"
record: cluster_job:cortex_querier_request_duration_seconds:avg record: "cluster_job:cortex_querier_request_duration_seconds:avg"
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job) job)"
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job) job)"
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job) job)"
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route)) by (le, cluster, job, route))"
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route)) by (le, cluster, job, route))"
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
(cluster, job, route) (cluster, job, route)"
record: cluster_job_route:cortex_querier_request_duration_seconds:avg record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job, route) job, route)"
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route) job, route)"
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job, route) job, route)"
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route)) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route)) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
by (cluster, namespace, job, route) by (cluster, namespace, job, route)"
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
namespace, job, route) namespace, job, route)"
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route) namespace, job, route)"
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route) namespace, job, route)"
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
- name: mimir_cache - name: "mimir_cache"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method)) by (le, cluster, job, method))"
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method)) by (le, cluster, job, method))"
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
by (cluster, job, method) by (cluster, job, method)"
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method) job, method)"
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method) job, method)"
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
job, method) job, method)"
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_cache_request_duration_seconds:99quantile record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_cache_request_duration_seconds:50quantile record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)"
record: cluster_job:cortex_cache_request_duration_seconds:avg record: "cluster_job:cortex_cache_request_duration_seconds:avg"
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job) job)"
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job) job)"
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method)) by (le, cluster, job, method))"
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method)) by (le, cluster, job, method))"
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method) job, method)"
record: cluster_job_method:cortex_cache_request_duration_seconds:avg record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method) job, method)"
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method) method)"
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method) job, method)"
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
- name: mimir_storage - name: "mimir_storage"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_kv_request_duration_seconds:99quantile record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_kv_request_duration_seconds:50quantile record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
record: cluster_job:cortex_kv_request_duration_seconds:avg record: "cluster_job:cortex_kv_request_duration_seconds:avg"
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
job) job)"
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - expr: "sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
- name: mimir_queries - name: "mimir_queries"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_query_frontend_retries:99quantile record: "cluster_job:cortex_query_frontend_retries:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_query_frontend_retries:50quantile record: "cluster_job:cortex_query_frontend_retries:50quantile"
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) - expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
by (cluster, job) by (cluster, job)"
record: cluster_job:cortex_query_frontend_retries:avg record: "cluster_job:cortex_query_frontend_retries:avg"
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) - expr: "sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)"
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) - expr: "sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)"
record: cluster_job:cortex_query_frontend_retries_count:sum_rate record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
(cluster, job) (cluster, job)"
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
cluster, job) cluster, job)"
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job) job)"
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
job) job)"
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
- name: mimir_ingester_queries - name: "mimir_ingester_queries"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_ingester_queried_series:99quantile record: "cluster_job:cortex_ingester_queried_series:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_ingester_queried_series:50quantile record: "cluster_job:cortex_ingester_queried_series:50quantile"
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) - expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
by (cluster, job) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_series:avg record: "cluster_job:cortex_ingester_queried_series:avg"
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) - expr: "sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)"
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - expr: "sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_series_count:sum_rate record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_ingester_queried_samples:99quantile record: "cluster_job:cortex_ingester_queried_samples:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_ingester_queried_samples:50quantile record: "cluster_job:cortex_ingester_queried_samples:50quantile"
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) - expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
by (cluster, job) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_samples:avg record: "cluster_job:cortex_ingester_queried_samples:avg"
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) - expr: "sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)"
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
- expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) - expr: "sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_ingester_queried_exemplars:99quantile record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
by (le, cluster, job)) by (le, cluster, job))"
record: cluster_job:cortex_ingester_queried_exemplars:50quantile record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_exemplars:avg record: "cluster_job:cortex_ingester_queried_exemplars:avg"
- expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, - expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
job) job)"
record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
- expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - expr: "sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
- name: mimir_received_samples - name: "mimir_received_samples"
rules: rules:
- expr: | - expr: "|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
- name: mimir_exemplars_in - name: "mimir_exemplars_in"
rules: rules:
- expr: | - expr: "|
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
- name: mimir_received_exemplars - name: "mimir_received_exemplars"
rules: rules:
- expr: | - expr: "|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
- name: mimir_exemplars_ingested - name: "mimir_exemplars_ingested"
rules: rules:
- expr: | - expr: "|
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
- name: mimir_exemplars_appended - name: "mimir_exemplars_appended"
rules: rules:
- expr: | - expr: "|
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
- name: mimir_scaling_rules - name: "mimir_scaling_rules"
rules: rules:
- expr: | - expr: "|
# Convenience rule to get the number of replicas for both a deployment and a statefulset. # Convenience rule to get the number of replicas for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix. # Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
sum by (cluster, namespace, deployment) ( sum by (cluster, namespace, deployment) (
label_replace( label_replace(
kube_deployment_spec_replicas, kube_deployment_spec_replicas,
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )
or or
sum by (cluster, namespace, deployment) ( sum by (cluster, namespace, deployment) (
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") label_replace(kube_statefulset_replicas, \"deployment\", \"$1\", \"statefulset\", \"(.*?)(?:-zone-[a-z])?\")
) )"
record: cluster_namespace_deployment:actual_replicas:count record: "cluster_namespace_deployment:actual_replicas:count"
- expr: | - expr: "|
ceil( ceil(
quantile_over_time(0.99, quantile_over_time(0.99,
sum by (cluster, namespace) ( sum by (cluster, namespace) (
@ -324,21 +324,21 @@ groups:
)[24h:] )[24h:]
) )
/ 240000 / 240000
) )"
labels: labels:
deployment: distributor deployment: "distributor"
reason: sample_rate reason: "sample_rate"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
ceil( ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
* 0.59999999999999998 / 240000 * 0.59999999999999998 / 240000
) )"
labels: labels:
deployment: distributor deployment: "distributor"
reason: sample_rate_limits reason: "sample_rate_limits"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
ceil( ceil(
quantile_over_time(0.99, quantile_over_time(0.99,
sum by (cluster, namespace) ( sum by (cluster, namespace) (
@ -346,12 +346,12 @@ groups:
)[24h:] )[24h:]
) )
* 3 / 80000 * 3 / 80000
) )"
labels: labels:
deployment: ingester deployment: "ingester"
reason: sample_rate reason: "sample_rate"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
ceil( ceil(
quantile_over_time(0.99, quantile_over_time(0.99,
sum by(cluster, namespace) ( sum by(cluster, namespace) (
@ -359,59 +359,59 @@ groups:
)[24h:] )[24h:]
) )
/ 1500000 / 1500000
) )"
labels: labels:
deployment: ingester deployment: "ingester"
reason: active_series reason: "active_series"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
ceil( ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"max_global_series_per_user\"})
* 3 * 0.59999999999999998 / 1500000 * 3 * 0.59999999999999998 / 1500000
) )"
labels: labels:
deployment: ingester deployment: "ingester"
reason: active_series_limits reason: "active_series_limits"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
ceil( ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
* 0.59999999999999998 / 80000 * 0.59999999999999998 / 80000
) )"
labels: labels:
deployment: ingester deployment: "ingester"
reason: sample_rate_limits reason: "sample_rate_limits"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
ceil( ceil(
(sum by (cluster, namespace) ( (sum by (cluster, namespace) (
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} cortex_ingester_tsdb_storage_blocks_bytes{job=~\".+/ingester.*\"}
) / 4) ) / 4)
/ /
avg by (cluster, namespace) ( avg by (cluster, namespace) (
memcached_limit_bytes{job=~".+/memcached"} memcached_limit_bytes{job=~\".+/memcached\"}
)
) )
)"
labels: labels:
deployment: memcached deployment: "memcached"
reason: active_series reason: "active_series"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
sum by (cluster, namespace, deployment) ( sum by (cluster, namespace, deployment) (
label_replace( label_replace(
label_replace( label_replace(
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
), ),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )"
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
- expr: | - expr: "|
# Convenience rule to get the CPU request for both a deployment and a statefulset. # Convenience rule to get the CPU request for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix. # Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref: # that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
@ -424,11 +424,11 @@ groups:
label_replace( label_replace(
label_replace( label_replace(
kube_pod_container_resource_requests_cpu_cores, kube_pod_container_resource_requests_cpu_cores,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
), ),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )
) )
@ -439,17 +439,17 @@ groups:
sum by (cluster, namespace, deployment) ( sum by (cluster, namespace, deployment) (
label_replace( label_replace(
label_replace( label_replace(
kube_pod_container_resource_requests{resource="cpu"}, kube_pod_container_resource_requests{resource=\"cpu\"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
), ),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )
) )"
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
- expr: | - expr: "|
# Jobs should be sized to their CPU usage. # Jobs should be sized to their CPU usage.
# We do this by comparing 99th percentile usage over the last 24hrs to # We do this by comparing 99th percentile usage over the last 24hrs to
# their current provisioned #replicas and resource requests. # their current provisioned #replicas and resource requests.
@ -459,28 +459,28 @@ groups:
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
/ /
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
) )"
labels: labels:
reason: cpu_usage reason: "cpu_usage"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: | - expr: "|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset. # Convenience rule to get the Memory utilization for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix. # Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
sum by (cluster, namespace, deployment) ( sum by (cluster, namespace, deployment) (
label_replace( label_replace(
label_replace( label_replace(
container_memory_usage_bytes{image!=""}, container_memory_usage_bytes{image!=\"\"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
), ),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )"
record: cluster_namespace_deployment:container_memory_usage_bytes:sum record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
- expr: | - expr: "|
# Convenience rule to get the Memory request for both a deployment and a statefulset. # Convenience rule to get the Memory request for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix. # Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref: # that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
@ -493,11 +493,11 @@ groups:
label_replace( label_replace(
label_replace( label_replace(
kube_pod_container_resource_requests_memory_bytes, kube_pod_container_resource_requests_memory_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
), ),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )
) )
@ -508,17 +508,17 @@ groups:
sum by (cluster, namespace, deployment) ( sum by (cluster, namespace, deployment) (
label_replace( label_replace(
label_replace( label_replace(
kube_pod_container_resource_requests{resource="memory"}, kube_pod_container_resource_requests{resource=\"memory\"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
), ),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed. # always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
) )
) )
) )"
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
- expr: | - expr: "|
# Jobs should be sized to their Memory usage. # Jobs should be sized to their Memory usage.
# We do this by comparing 99th percentile usage over the last 24hrs to # We do this by comparing 99th percentile usage over the last 24hrs to
# their current provisioned #replicas and resource requests. # their current provisioned #replicas and resource requests.
@ -528,44 +528,44 @@ groups:
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
/ /
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
) )"
labels: labels:
reason: memory_usage reason: "memory_usage"
record: cluster_namespace_deployment_reason:required_replicas:count record: "cluster_namespace_deployment_reason:required_replicas:count"
- name: mimir_alertmanager_rules - name: "mimir_alertmanager_rules"
rules: rules:
- expr: | - expr: "|
sum by (cluster, job, pod) (cortex_alertmanager_alerts) sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
record: cluster_job_pod:cortex_alertmanager_alerts:sum record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
- expr: | - expr: "|
sum by (cluster, job, pod) (cortex_alertmanager_silences) sum by (cluster, job, pod) (cortex_alertmanager_silences)"
record: cluster_job_pod:cortex_alertmanager_silences:sum record: "cluster_job_pod:cortex_alertmanager_silences:sum"
- expr: | - expr: "|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
- expr: | - expr: "|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
- name: mimir_ingester_rules - name: "mimir_ingester_rules"
rules: rules:
- expr: | - expr: "|
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))"
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"

View File

@ -1,15 +1,15 @@
groups: groups:
- name: tempo_rules - name: "tempo_rules"
rules: rules:
- expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
- expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
record: cluster_namespace_job_route:tempo_request_duration_seconds:avg record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
- expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) - expr: "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)"
record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)"
record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
- expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"

View File

@ -14,6 +14,6 @@ data:
{{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }} {{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }}
"agent-tracing-pipeline.json": | "agent-tracing-pipeline.json": |
{{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }} {{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }}
"agent.json": | "agent-overview.json": |
{{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }} {{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }}
{{- end }} {{- end }}

View File

@ -3,7 +3,7 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
name: meta-mimir-ruler-for-dashboards name: meta-mimir-ruler-for-dashboards
namespace: meta namespace: {{ $.Release.Namespace }}
spec: spec:
progressDeadlineSeconds: 600 progressDeadlineSeconds: 600
replicas: 1 replicas: 1
@ -24,7 +24,7 @@ spec:
app.kubernetes.io/component: ruler-for-dashboards app.kubernetes.io/component: ruler-for-dashboards
app.kubernetes.io/instance: meta app.kubernetes.io/instance: meta
app.kubernetes.io/name: mimir app.kubernetes.io/name: mimir
namespace: meta namespace: {{ $.Release.Namespace }}
spec: spec:
containers: containers:
- args: - args:

View File

@ -9,28 +9,28 @@ clusterName: "meta-monitoring"
# Set to true for a local version of logs, metrics or traces # Set to true for a local version of logs, metrics or traces
local: local:
logs: logs:
enabled: false enabled: true
metrics: metrics:
enabled: false enabled: true
traces: traces:
enabled: false enabled: true
minio: minio:
enabled: false # This should be set to true if any of the previous is enabled enabled: true # This should be set to true if any of the previous is enabled
# Set to true to write logs, metrics or traces to Grafana Cloud # Set to true to write logs, metrics or traces to Grafana Cloud
cloud: cloud:
logs: logs:
enabled: true enabled: false
endpoint: endpoint:
username: username:
password: password:
metrics: metrics:
enabled: true enabled: false
endpoint: endpoint:
username: username:
password: password:
traces: traces:
enabled: true enabled: false
endpoint: endpoint:
username: username:
password: password: