forked from RemoteSync/grafana-meta-monitoring-chart
Compare commits
67 Commits
add_ci
...
add_agent_
Author | SHA1 | Date | |
---|---|---|---|
|
a6462d1ac1 | ||
|
0d3f9a1416 | ||
|
8fa5b63db7 | ||
|
d7063da3d4 | ||
|
e7f28a261e | ||
|
509a32bc59 | ||
|
6bb31ad5e0 | ||
|
7724d9c928 | ||
|
13294675fe | ||
|
bf71def2f8 | ||
|
b37fa4adf5 | ||
|
18a5face81 | ||
|
5e908f796c | ||
|
17b52d572a | ||
|
6eac38d4ec | ||
|
3706c702a1 | ||
|
28b77dab17 | ||
|
9770a3e5b3 | ||
|
6cbffd6d9d | ||
|
4ae23a99d2 | ||
|
20232e9cf3 | ||
|
043a503ce7 | ||
|
39f50d8580 | ||
|
d9fc9e4f4e | ||
|
f61913d3da | ||
|
c29daab64d | ||
|
d389a9f741 | ||
|
6f5f50f901 | ||
|
efea1c5054 | ||
|
b02aee6816 | ||
|
c522e3f39e | ||
|
e3542e472d | ||
|
3a138991ff | ||
|
cd78caab48 | ||
|
f281741de9 | ||
|
381ecb2c06 | ||
|
20cdb8dcc1 | ||
|
019f2b7b1e | ||
|
1bffcac5e5 | ||
|
d23291dc91 | ||
|
a89ba944a3 | ||
|
ef05e599e6 | ||
|
a586e753da | ||
|
76908c1e9e | ||
|
bc5cdadb9f | ||
|
687c77c0f6 | ||
|
2a0b14ee45 | ||
|
7e06d611a7 | ||
|
f4934d6007 | ||
|
427764278c | ||
|
1093e91741 | ||
|
1ed196299b | ||
|
faa0015c11 | ||
|
53416e042c | ||
|
d804da13f1 | ||
|
8c0b68fe02 | ||
|
99bb8f13c2 | ||
|
26ff679cbb | ||
|
fb3e3ece1b | ||
|
7a5358b322 | ||
|
9c92e18efe | ||
|
ffe220590d | ||
|
e3708ce3fe | ||
|
3149f4df9b | ||
|
86ec586917 | ||
|
6cd12bee01 | ||
|
b042b396a2 |
66
.github/workflows/helm-ci.yml
vendored
Normal file
66
.github/workflows/helm-ci.yml
vendored
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
---
|
||||||
|
name: helm-ci
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- "charts/meta-monitoring/**"
|
||||||
|
|
||||||
|
env:
|
||||||
|
CT_CONFIGFILE: charts/meta-monitoring/ct.yaml
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
call-lint:
|
||||||
|
name: Lint Helm Chart
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout Code
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Lint Yaml
|
||||||
|
run: make helm-lint
|
||||||
|
|
||||||
|
# call-test:
|
||||||
|
# name: Test Helm Chart
|
||||||
|
# runs-on: ubuntu-latest
|
||||||
|
# steps:
|
||||||
|
# - name: Checkout
|
||||||
|
# uses: actions/checkout@v3
|
||||||
|
# with:
|
||||||
|
# fetch-depth: 0
|
||||||
|
|
||||||
|
# - name: Set up Helm
|
||||||
|
# uses: azure/setup-helm@v3
|
||||||
|
# with:
|
||||||
|
# version: v3.8.2
|
||||||
|
|
||||||
|
# # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
|
||||||
|
# # yamllint (https://github.com/adrienverge/yamllint) which require Python
|
||||||
|
# - name: Set up Python
|
||||||
|
# uses: actions/setup-python@v4
|
||||||
|
# with:
|
||||||
|
# python-version: 3.7
|
||||||
|
|
||||||
|
# - name: Set up chart-testing
|
||||||
|
# uses: helm/chart-testing-action@v2.4.0
|
||||||
|
|
||||||
|
# - name: Run chart-testing (list-changed)
|
||||||
|
# id: list-changed
|
||||||
|
# run: |
|
||||||
|
# changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||||
|
# if [[ -n "$changed" ]]; then
|
||||||
|
# echo "changed=true" >> $GITHUB_OUTPUT
|
||||||
|
# fi
|
||||||
|
|
||||||
|
# - name: Run chart-testing (lint)
|
||||||
|
# run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
|
||||||
|
|
||||||
|
# - name: Create kind cluster
|
||||||
|
# uses: helm/kind-action@v1.8.0
|
||||||
|
# if: steps.list-changed.outputs.changed == 'true'
|
||||||
|
# with:
|
||||||
|
# config: tools/kind.config
|
||||||
|
|
||||||
|
# - name: Run chart-testing (install)
|
||||||
|
# run: |
|
||||||
|
# changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||||
|
# ct install --config "${CT_CONFIGFILE}"
|
10
Makefile
Normal file
10
Makefile
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Adapted from https://www.thapaliya.com/en/writings/well-documented-makefiles/
|
||||||
|
.PHONY: help
|
||||||
|
help: ## Display this help and any documented user-facing targets. Other undocumented targets may be present in the Makefile.
|
||||||
|
help:
|
||||||
|
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make <target>\n\nTargets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " %-45s %s\n", $$1, $$2 }' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
|
.PHONY: helm-lint
|
||||||
|
|
||||||
|
helm-lint: ## Run helm linter
|
||||||
|
$(MAKE) -BC charts/meta-monitoring lint
|
@@ -1,18 +1,18 @@
|
|||||||
dependencies:
|
dependencies:
|
||||||
- name: loki
|
- name: loki
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 5.8.0
|
version: 5.47.2
|
||||||
- name: grafana-agent
|
- name: grafana-agent
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 0.15.0
|
version: 0.37.0
|
||||||
- name: mimir-distributed
|
- name: mimir-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 4.4.1
|
version: 5.2.0
|
||||||
- name: tempo-distributed
|
- name: tempo-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 1.4.7
|
version: 1.9.1
|
||||||
- name: minio
|
- name: minio
|
||||||
repository: https://charts.min.io
|
repository: https://charts.min.io
|
||||||
version: 5.0.11
|
version: 5.0.11
|
||||||
digest: sha256:4b04084e6fe821c4d481017b2430f7c8cd782a5d60830dd3a24eb8f10a9ece09
|
digest: sha256:7b7e62e08d9a56e63fdb12ce3fd4d1fda4887545546ac3e98c7886be714fd763
|
||||||
generated: "2023-06-29T14:25:07.247853+01:00"
|
generated: "2024-04-02T15:09:13.121195+01:00"
|
||||||
|
@@ -25,21 +25,21 @@ appVersion: "0.0.1"
|
|||||||
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- name: loki
|
- name: loki
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "5.8.0"
|
version: "5.47.2"
|
||||||
condition: local.logs.enabled
|
condition: local.logs.enabled
|
||||||
- name: grafana-agent
|
- name: grafana-agent
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "0.15.0"
|
version: "0.37.0"
|
||||||
- name: mimir-distributed
|
- name: mimir-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "4.4.1"
|
version: "5.2.0"
|
||||||
condition: local.metrics.enabled
|
condition: local.metrics.enabled
|
||||||
- name: tempo-distributed
|
- name: tempo-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "1.4.7"
|
version: "1.9.1"
|
||||||
condition: local.traces.enabled
|
condition: local.traces.enabled
|
||||||
- name: minio
|
- name: minio
|
||||||
repository: https://charts.min.io
|
repository: https://charts.min.io
|
||||||
version: "5.0.11"
|
version: "5.0.11"
|
||||||
condition: local.minio.enabled
|
condition: local.minio.enabled
|
||||||
|
7
charts/meta-monitoring/Makefile
Normal file
7
charts/meta-monitoring/Makefile
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
.DEFAULT_GOAL := lint
|
||||||
|
.PHONY: lint lint-yaml
|
||||||
|
|
||||||
|
lint: lint-yaml
|
||||||
|
|
||||||
|
lint-yaml:
|
||||||
|
yamllint -c $(CURDIR)/src/.yamllint.yaml $(CURDIR)/src
|
Binary file not shown.
BIN
charts/meta-monitoring/charts/grafana-agent-0.37.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/grafana-agent-0.37.0.tgz
Normal file
Binary file not shown.
BIN
charts/meta-monitoring/charts/loki-5.47.2.tgz
Normal file
BIN
charts/meta-monitoring/charts/loki-5.47.2.tgz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/mimir-distributed-5.2.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/mimir-distributed-5.2.0.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/tempo-distributed-1.9.1.tgz
Normal file
BIN
charts/meta-monitoring/charts/tempo-distributed-1.9.1.tgz
Normal file
Binary file not shown.
11
charts/meta-monitoring/ct.yaml
Normal file
11
charts/meta-monitoring/ct.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
remote: origin
|
||||||
|
target-branch: main
|
||||||
|
chart-dirs:
|
||||||
|
- charts
|
||||||
|
chart-repos:
|
||||||
|
- grafana=https://grafana.github.io/helm-charts
|
||||||
|
- minio=https://charts.min.io
|
||||||
|
helm-extra-args: --timeout 1200s
|
||||||
|
check-version-increment: false
|
||||||
|
validate-maintainers: false
|
4
charts/meta-monitoring/src/.yamllint.yaml
Normal file
4
charts/meta-monitoring/src/.yamllint.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
rules:
|
||||||
|
quoted-strings:
|
||||||
|
required: true
|
@@ -1,53 +1,53 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: loki_rules
|
- name: "loki_rules"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:loki_request_duration_seconds:99quantile
|
record: "cluster_job:loki_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:loki_request_duration_seconds:50quantile
|
record: "cluster_job:loki_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[5m]))
|
||||||
by (cluster, job)
|
by (cluster, job)"
|
||||||
record: cluster_job:loki_request_duration_seconds:avg
|
record: "cluster_job:loki_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
|
||||||
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:loki_request_duration_seconds_sum:sum_rate
|
record: "cluster_job:loki_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
|
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:loki_request_duration_seconds_count:sum_rate
|
record: "cluster_job:loki_request_duration_seconds_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, route))
|
by (le, cluster, job, route))"
|
||||||
record: cluster_job_route:loki_request_duration_seconds:99quantile
|
record: "cluster_job_route:loki_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, route))
|
by (le, cluster, job, route))"
|
||||||
record: cluster_job_route:loki_request_duration_seconds:50quantile
|
record: "cluster_job_route:loki_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)
|
||||||
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
|
/ sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
record: cluster_job_route:loki_request_duration_seconds:avg
|
record: "cluster_job_route:loki_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job,
|
||||||
route)
|
route)"
|
||||||
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)"
|
||||||
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
|
record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
|
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
|
record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, namespace, job, route))
|
by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
|
record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, namespace, job, route))
|
by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
|
record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
|
job, route) / sum(rate(loki_request_duration_seconds_count[5m])) by (cluster,
|
||||||
namespace, job, route)
|
namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:loki_request_duration_seconds:avg
|
record: "cluster_namespace_job_route:loki_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
|
record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
|
record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
|
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, namespace,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
|
record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate"
|
||||||
|
@@ -1,304 +1,299 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: mimir_api_1
|
- name: "mimir_api_1"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_request_duration_seconds:99quantile
|
record: "cluster_job:cortex_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_request_duration_seconds:50quantile
|
record: "cluster_job:cortex_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[5m]))
|
||||||
by (cluster, job)
|
by (cluster, job)"
|
||||||
record: cluster_job:cortex_request_duration_seconds:avg
|
record: "cluster_job:cortex_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
|
||||||
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
|
record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
|
record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_api_2
|
- name: "mimir_api_2"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, route))
|
by (le, cluster, job, route))"
|
||||||
record: cluster_job_route:cortex_request_duration_seconds:99quantile
|
record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, route))
|
by (le, cluster, job, route))"
|
||||||
record: cluster_job_route:cortex_request_duration_seconds:50quantile
|
record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)
|
||||||
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
/ sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
record: cluster_job_route:cortex_request_duration_seconds:avg
|
record: "cluster_job_route:cortex_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job,
|
||||||
route)
|
route)"
|
||||||
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)"
|
||||||
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
|
record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
|
record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_api_3
|
- name: "mimir_api_3"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, namespace, job, route))
|
by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, namespace, job, route))
|
by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
|
job, route) / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster,
|
||||||
namespace, job, route)
|
namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
|
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, namespace,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_querier_api
|
- name: "mimir_querier_api"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_querier_request_duration_seconds:99quantile
|
record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_querier_request_duration_seconds:50quantile
|
record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
job) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_querier_request_duration_seconds:avg
|
record: "cluster_job:cortex_querier_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
|
record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
|
record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, route))
|
by (le, cluster, job, route))"
|
||||||
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, route))
|
by (le, cluster, job, route))"
|
||||||
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
|
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by
|
||||||
(cluster, job, route)
|
(cluster, job, route)"
|
||||||
record: cluster_job_route:cortex_querier_request_duration_seconds:avg
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job, route)
|
job, route)"
|
||||||
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, namespace, job, route))
|
by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, namespace, job, route))
|
by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
|
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m]))
|
||||||
by (cluster, namespace, job, route)
|
by (cluster, namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
namespace, job, route)
|
namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
namespace, job, route)
|
namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
namespace, job, route)
|
namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_cache
|
- name: "mimir_cache"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, method))
|
by (le, cluster, job, method))"
|
||||||
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, method))
|
by (le, cluster, job, method))"
|
||||||
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
|
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[5m]))
|
||||||
by (cluster, job, method)
|
by (cluster, job, method)"
|
||||||
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
job, method)
|
job, method)"
|
||||||
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
job, method)
|
job, method)"
|
||||||
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job, method)
|
job, method)"
|
||||||
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
|
record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
|
record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)
|
||||||
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
|
/ sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_cache_request_duration_seconds:avg
|
record: "cluster_job:cortex_cache_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
|
record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
|
record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, method))
|
by (le, cluster, job, method))"
|
||||||
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job, method))
|
by (le, cluster, job, method))"
|
||||||
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
|
||||||
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
method) / sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job, method)
|
job, method)"
|
||||||
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
job, method)
|
job, method)"
|
||||||
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
|
||||||
method)
|
method)"
|
||||||
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
job, method)
|
job, method)"
|
||||||
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_storage
|
- name: "mimir_storage"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
|
record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
|
record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)
|
||||||
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
/ sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds:avg
|
record: "cluster_job:cortex_kv_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
|
record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
|
record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
|
record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_queries
|
- name: "mimir_queries"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_query_frontend_retries:99quantile
|
record: "cluster_job:cortex_query_frontend_retries:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_query_frontend_retries:50quantile
|
record: "cluster_job:cortex_query_frontend_retries:50quantile"
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[5m]))
|
||||||
by (cluster, job)
|
by (cluster, job)"
|
||||||
record: cluster_job:cortex_query_frontend_retries:avg
|
record: "cluster_job:cortex_query_frontend_retries:avg"
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
|
- expr: "sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job)"
|
||||||
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
|
record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
|
record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_query_frontend_retries_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
|
record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
|
||||||
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by
|
||||||
(cluster, job)
|
(cluster, job)"
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le,
|
||||||
cluster, job)
|
cluster, job)"
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by (cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
|
||||||
- name: mimir_ingester_queries
|
- name: "mimir_ingester_queries"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_ingester_queried_series:99quantile
|
record: "cluster_job:cortex_ingester_queried_series:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_ingester_queried_series:50quantile
|
record: "cluster_job:cortex_ingester_queried_series:50quantile"
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[5m]))
|
||||||
by (cluster, job)
|
by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_series:avg
|
record: "cluster_job:cortex_ingester_queried_series:avg"
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
|
record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
|
record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_series_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
|
record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_ingester_queried_samples:99quantile
|
record: "cluster_job:cortex_ingester_queried_samples:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_ingester_queried_samples:50quantile
|
record: "cluster_job:cortex_ingester_queried_samples:50quantile"
|
||||||
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[5m]))
|
||||||
by (cluster, job)
|
by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_samples:avg
|
record: "cluster_job:cortex_ingester_queried_samples:avg"
|
||||||
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
|
record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
|
record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_samples_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
|
record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_ingester_queried_exemplars:99quantile
|
record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
|
||||||
by (le, cluster, job))
|
by (le, cluster, job))"
|
||||||
record: cluster_job:cortex_ingester_queried_exemplars:50quantile
|
record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
|
||||||
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job) /
|
||||||
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_exemplars:avg
|
record: "cluster_job:cortex_ingester_queried_exemplars:avg"
|
||||||
- expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster,
|
||||||
job)
|
job)"
|
||||||
record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
|
record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
|
||||||
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
|
record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
|
||||||
- expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
|
||||||
record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
|
record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
|
||||||
- name: mimir_received_samples
|
- name: "mimir_received_samples"
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
|
||||||
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
|
record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
|
||||||
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
- name: "mimir_exemplars_in"
|
||||||
- name: mimir_exemplars_in
|
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
|
||||||
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
|
record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
|
||||||
record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
|
- name: "mimir_received_exemplars"
|
||||||
- name: mimir_received_exemplars
|
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
|
||||||
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
|
record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
|
||||||
record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
|
- name: "mimir_exemplars_ingested"
|
||||||
- name: mimir_exemplars_ingested
|
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
|
||||||
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
|
record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
|
||||||
record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
|
- name: "mimir_exemplars_appended"
|
||||||
- name: mimir_exemplars_appended
|
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
|
||||||
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
|
record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
|
||||||
record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
|
- name: "mimir_scaling_rules"
|
||||||
- name: mimir_scaling_rules
|
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: |
|
||||||
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
||||||
@@ -315,7 +310,7 @@ groups:
|
|||||||
sum by (cluster, namespace, deployment) (
|
sum by (cluster, namespace, deployment) (
|
||||||
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
|
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
|
||||||
)
|
)
|
||||||
record: cluster_namespace_deployment:actual_replicas:count
|
record: "cluster_namespace_deployment:actual_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
quantile_over_time(0.99,
|
quantile_over_time(0.99,
|
||||||
@@ -326,18 +321,18 @@ groups:
|
|||||||
/ 240000
|
/ 240000
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: distributor
|
deployment: "distributor"
|
||||||
reason: sample_rate
|
reason: "sample_rate"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||||
* 0.59999999999999998 / 240000
|
* 0.59999999999999998 / 240000
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: distributor
|
deployment: "distributor"
|
||||||
reason: sample_rate_limits
|
reason: "sample_rate_limits"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
quantile_over_time(0.99,
|
quantile_over_time(0.99,
|
||||||
@@ -348,9 +343,9 @@ groups:
|
|||||||
* 3 / 80000
|
* 3 / 80000
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: ingester
|
deployment: "ingester"
|
||||||
reason: sample_rate
|
reason: "sample_rate"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
quantile_over_time(0.99,
|
quantile_over_time(0.99,
|
||||||
@@ -361,27 +356,27 @@ groups:
|
|||||||
/ 1500000
|
/ 1500000
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: ingester
|
deployment: "ingester"
|
||||||
reason: active_series
|
reason: "active_series"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
|
||||||
* 3 * 0.59999999999999998 / 1500000
|
* 3 * 0.59999999999999998 / 1500000
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: ingester
|
deployment: "ingester"
|
||||||
reason: active_series_limits
|
reason: "active_series_limits"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||||
* 0.59999999999999998 / 80000
|
* 0.59999999999999998 / 80000
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: ingester
|
deployment: "ingester"
|
||||||
reason: sample_rate_limits
|
reason: "sample_rate_limits"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
ceil(
|
ceil(
|
||||||
(sum by (cluster, namespace) (
|
(sum by (cluster, namespace) (
|
||||||
@@ -393,14 +388,14 @@ groups:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
deployment: memcached
|
deployment: "memcached"
|
||||||
reason: active_series
|
reason: "active_series"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
sum by (cluster, namespace, deployment) (
|
sum by (cluster, namespace, deployment) (
|
||||||
label_replace(
|
label_replace(
|
||||||
label_replace(
|
label_replace(
|
||||||
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
|
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[5m])),
|
||||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
),
|
),
|
||||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
@@ -408,7 +403,7 @@ groups:
|
|||||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
|
record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
|
||||||
- expr: |
|
- expr: |
|
||||||
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
||||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
@@ -448,7 +443,7 @@ groups:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
|
||||||
- expr: |
|
- expr: |
|
||||||
# Jobs should be sized to their CPU usage.
|
# Jobs should be sized to their CPU usage.
|
||||||
# We do this by comparing 99th percentile usage over the last 24hrs to
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
@@ -461,8 +456,8 @@ groups:
|
|||||||
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
reason: cpu_usage
|
reason: "cpu_usage"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- expr: |
|
- expr: |
|
||||||
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
||||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
@@ -477,7 +472,7 @@ groups:
|
|||||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
record: cluster_namespace_deployment:container_memory_usage_bytes:sum
|
record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
|
||||||
- expr: |
|
- expr: |
|
||||||
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
||||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
@@ -517,7 +512,7 @@ groups:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
|
||||||
- expr: |
|
- expr: |
|
||||||
# Jobs should be sized to their Memory usage.
|
# Jobs should be sized to their Memory usage.
|
||||||
# We do this by comparing 99th percentile usage over the last 24hrs to
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
@@ -530,42 +525,31 @@ groups:
|
|||||||
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
)
|
)
|
||||||
labels:
|
labels:
|
||||||
reason: memory_usage
|
reason: "memory_usage"
|
||||||
record: cluster_namespace_deployment_reason:required_replicas:count
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
- name: mimir_alertmanager_rules
|
- name: "mimir_alertmanager_rules"
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
|
||||||
sum by (cluster, job, pod) (cortex_alertmanager_alerts)
|
record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
|
||||||
record: cluster_job_pod:cortex_alertmanager_alerts:sum
|
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_silences)"
|
||||||
- expr: |
|
record: "cluster_job_pod:cortex_alertmanager_silences:sum"
|
||||||
sum by (cluster, job, pod) (cortex_alertmanager_silences)
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
|
||||||
record: cluster_job_pod:cortex_alertmanager_silences:sum
|
record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
|
||||||
- expr: |
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
|
||||||
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
|
record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
|
||||||
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
|
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
|
||||||
- expr: |
|
record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
|
||||||
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
|
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
|
||||||
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
|
record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
|
||||||
- expr: |
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
|
||||||
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
|
record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
|
||||||
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
|
||||||
- expr: |
|
record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
|
||||||
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
|
||||||
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
|
record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
|
||||||
- expr: |
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
|
||||||
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
|
record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
|
||||||
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
|
- name: "mimir_ingester_rules"
|
||||||
- expr: |
|
|
||||||
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
|
|
||||||
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
|
|
||||||
- expr: |
|
|
||||||
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
|
|
||||||
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
|
|
||||||
- expr: |
|
|
||||||
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
|
|
||||||
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
|
|
||||||
- name: mimir_ingester_rules
|
|
||||||
rules:
|
rules:
|
||||||
- expr: |
|
- expr: "sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[5m]))"
|
||||||
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
|
record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"
|
||||||
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
|
|
||||||
|
@@ -1,15 +1,15 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: tempo_rules
|
- name: "tempo_rules"
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
- expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
|
||||||
- expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
- expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
|
||||||
record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
|
||||||
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:tempo_request_duration_seconds:avg
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
|
||||||
- expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
|
- expr: "sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
|
||||||
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
|
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
|
||||||
- expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
- expr: "sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
|
||||||
record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"
|
||||||
|
33
charts/meta-monitoring/templates/_helpers.tpl
Normal file
33
charts/meta-monitoring/templates/_helpers.tpl
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
{{/*
|
||||||
|
Return the appropriate apiVersion for ingress.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.apiVersion" -}}
|
||||||
|
{{- if and (.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) -}}
|
||||||
|
{{- print "networking.k8s.io/v1" -}}
|
||||||
|
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" -}}
|
||||||
|
{{- print "networking.k8s.io/v1beta1" -}}
|
||||||
|
{{- else -}}
|
||||||
|
{{- print "extensions/v1beta1" -}}
|
||||||
|
{{- end -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Return if ingress is stable.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.isStable" -}}
|
||||||
|
{{- eq (include "ingress.apiVersion" .) "networking.k8s.io/v1" -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Return if ingress supports ingressClassName.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.supportsIngressClassName" -}}
|
||||||
|
{{- or (eq (include "ingress.isStable" .) "true") (and (eq (include "ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Return if ingress supports pathType.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.supportsPathType" -}}
|
||||||
|
{{- or (eq (include "ingress.isStable" .) "true") (and (eq (include "ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) -}}
|
||||||
|
{{- end -}}
|
@@ -18,10 +18,10 @@
|
|||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{- define "agent.loki_process_targets" -}}
|
{{- define "agent.loki_process_targets" -}}
|
||||||
{{- if empty .Values.logs.piiRegexes }}
|
{{- if and (empty .Values.logs.piiRegexes) (empty .Values.logs.retain) }}
|
||||||
{{- include "agent.loki_write_targets" . }}
|
{{- include "agent.loki_write_targets" . }}
|
||||||
{{- else }}
|
{{- else }}
|
||||||
{{- printf "loki.process.PII.receiver" }}
|
{{- printf "loki.process.filter.receiver" }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
@@ -8,7 +8,7 @@ data:
|
|||||||
discovery.kubernetes "pods" {
|
discovery.kubernetes "pods" {
|
||||||
role = "pod"
|
role = "pod"
|
||||||
namespaces {
|
namespaces {
|
||||||
own_namespace = false
|
own_namespace = true
|
||||||
names = [ {{ include "agent.namespaces" . }} ]
|
names = [ {{ include "agent.namespaces" . }} ]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -33,22 +33,38 @@ data:
|
|||||||
}
|
}
|
||||||
rule {
|
rule {
|
||||||
target_label = "cluster"
|
target_label = "cluster"
|
||||||
replacement = "{{- .Values.clusterName -}}"
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
||||||
// Logs
|
// Logs
|
||||||
|
|
||||||
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
remote.kubernetes.secret "logs_credentials" {
|
||||||
|
namespace = "{{- $.Release.Namespace -}}"
|
||||||
|
name = "{{- .Values.cloud.logs.secret -}}"
|
||||||
|
}
|
||||||
|
|
||||||
loki.source.kubernetes "pods" {
|
loki.source.kubernetes "pods" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
targets = discovery.relabel.rename_meta_labels.output
|
||||||
forward_to = [ {{ include "agent.loki_process_targets" . }} ]
|
forward_to = [ {{ include "agent.loki_process_targets" . }} ]
|
||||||
}
|
}
|
||||||
|
|
||||||
{{- if not (empty .Values.logs.piiRegexes) }}
|
{{- if or (not (empty .Values.logs.retain)) (not (empty .Values.logs.piiRegexes)) }}
|
||||||
loki.process "PII" {
|
loki.process "filter" {
|
||||||
forward_to = [ {{ include "agent.loki_write_targets" . }} ]
|
forward_to = [ {{ include "agent.loki_write_targets" . }} ]
|
||||||
|
|
||||||
|
{{- if not (empty .Values.logs.retain) }}
|
||||||
|
stage.match {
|
||||||
|
selector = "{cluster=\"{{- .Values.clusterLabelValue -}}\", namespace=~\"{{- join "|" .Values.namespacesToMonitor -}}|{{- $.Release.Namespace -}}\", pod=~\"loki.*\"} !~ \"{{ join "|" .Values.logs.retain }}\""
|
||||||
|
action = "drop"
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if not (empty .Values.logs.piiRegexes) }}
|
||||||
{{- range .Values.logs.piiRegexes }}
|
{{- range .Values.logs.piiRegexes }}
|
||||||
stage.replace {
|
stage.replace {
|
||||||
expression = "{{ .expression }}"
|
expression = "{{ .expression }}"
|
||||||
@@ -56,26 +72,85 @@ data:
|
|||||||
replace = "{{ .replace }}"
|
replace = "{{ .replace }}"
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
||||||
// Metrics
|
// Metrics
|
||||||
|
|
||||||
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
remote.kubernetes.secret "metrics_credentials" {
|
||||||
|
namespace = "{{- $.Release.Namespace -}}"
|
||||||
|
name = "{{- .Values.cloud.metrics.secret -}}"
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.kubernetes "metric_pods" {
|
||||||
|
role = "pod"
|
||||||
|
namespaces {
|
||||||
|
own_namespace = true
|
||||||
|
names = [ {{ include "agent.namespaces" . }} ]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.relabel "only_http_metrics" {
|
||||||
|
targets = discovery.kubernetes.metric_pods.targets
|
||||||
|
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_container_port_number"]
|
||||||
|
action = "drop"
|
||||||
|
regex = "9095"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
prometheus.scrape "pods" {
|
prometheus.scrape "pods" {
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = discovery.relabel.only_http_metrics.output
|
||||||
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.relabel "filter" {
|
||||||
|
rule {
|
||||||
|
source_labels = ["__name__"]
|
||||||
|
regex = "({{ join "|" .Values.metrics.retain }})"
|
||||||
|
action = "keep"
|
||||||
|
}
|
||||||
|
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
}
|
}
|
||||||
{{- if .Values.kubeStateMetrics.enabled }}
|
{{- if .Values.kubeStateMetrics.enabled }}
|
||||||
|
|
||||||
prometheus.scrape "kubeStateMetrics" {
|
prometheus.scrape "kubeStateMetrics" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
|
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
// cAdvisor and Kubelete metrics
|
// cAdvisor and Kubelet metrics
|
||||||
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
|
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
|
||||||
discovery.kubernetes "all_nodes" {
|
discovery.kubernetes "all_nodes" {
|
||||||
role = "node"
|
role = "node"
|
||||||
@@ -104,15 +179,17 @@ data:
|
|||||||
}
|
}
|
||||||
rule {
|
rule {
|
||||||
target_label = "cluster"
|
target_label = "cluster"
|
||||||
replacement = "{{- .Values.clusterName -}}"
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
prometheus.scrape "cadvisor" {
|
prometheus.scrape "cadvisor" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
targets = discovery.relabel.all_nodes.output
|
targets = discovery.relabel.all_nodes.output
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
|
||||||
scrape_interval = "15s"
|
|
||||||
metrics_path = "/metrics/cadvisor"
|
metrics_path = "/metrics/cadvisor"
|
||||||
scheme = "https"
|
scheme = "https"
|
||||||
|
|
||||||
@@ -123,10 +200,12 @@ data:
|
|||||||
}
|
}
|
||||||
|
|
||||||
prometheus.scrape "kubelet" {
|
prometheus.scrape "kubelet" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
targets = discovery.relabel.all_nodes.output
|
targets = discovery.relabel.all_nodes.output
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
|
||||||
scrape_interval = "15s"
|
|
||||||
metrics_path = "/metrics"
|
metrics_path = "/metrics"
|
||||||
scheme = "https"
|
scheme = "https"
|
||||||
|
|
||||||
@@ -136,18 +215,20 @@ data:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
prometheus.exporter.unix {}
|
prometheus.exporter.unix "promexporter" {}
|
||||||
|
|
||||||
prometheus.scrape "node_exporter" {
|
prometheus.scrape "node_exporter" {
|
||||||
targets = prometheus.exporter.unix.targets
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = prometheus.exporter.unix.promexporter.targets
|
||||||
forward_to = [prometheus.relabel.node_exporter.receiver]
|
forward_to = [prometheus.relabel.node_exporter.receiver]
|
||||||
|
|
||||||
job_name = "node-exporter"
|
job_name = "node-exporter"
|
||||||
scrape_interval = "15s"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
prometheus.relabel "node_exporter" {
|
prometheus.relabel "node_exporter" {
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
|
||||||
rule {
|
rule {
|
||||||
replacement = env("HOSTNAME")
|
replacement = env("HOSTNAME")
|
||||||
@@ -178,14 +259,19 @@ data:
|
|||||||
}
|
}
|
||||||
rule {
|
rule {
|
||||||
target_label = "cluster"
|
target_label = "cluster"
|
||||||
replacement = "{{- .Values.clusterName -}}"
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
||||||
// Traces
|
// Traces
|
||||||
|
|
||||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
remote.kubernetes.secret "traces_credentials" {
|
||||||
|
namespace = "{{- $.Release.Namespace -}}"
|
||||||
|
name = "{{- .Values.cloud.traces.secret -}}"
|
||||||
|
}
|
||||||
|
|
||||||
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
||||||
otelcol.receiver.otlp "otlp_receiver" {
|
otelcol.receiver.otlp "otlp_receiver" {
|
||||||
// We don't technically need this, but it shows how to change listen address and incoming port.
|
// We don't technically need this, but it shows how to change listen address and incoming port.
|
||||||
@@ -254,11 +340,10 @@ data:
|
|||||||
{{- if .Values.cloud.logs.enabled }}
|
{{- if .Values.cloud.logs.enabled }}
|
||||||
loki.write "cloud" {
|
loki.write "cloud" {
|
||||||
endpoint {
|
endpoint {
|
||||||
url = "{{- .Values.cloud.logs.endpoint -}}/loki/api/v1/push"
|
url = nonsensitive(remote.kubernetes.secret.logs_credentials.data["endpoint"])
|
||||||
|
|
||||||
basic_auth {
|
basic_auth {
|
||||||
username = "{{- .Values.cloud.logs.username -}}"
|
username = nonsensitive(remote.kubernetes.secret.logs_credentials.data["username"])
|
||||||
password = "{{- .Values.cloud.logs.password -}}"
|
password = remote.kubernetes.secret.logs_credentials.data["password"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -267,11 +352,10 @@ data:
|
|||||||
{{- if .Values.cloud.metrics.enabled }}
|
{{- if .Values.cloud.metrics.enabled }}
|
||||||
prometheus.remote_write "cloud" {
|
prometheus.remote_write "cloud" {
|
||||||
endpoint {
|
endpoint {
|
||||||
url = "{{- .Values.cloud.metrics.endpoint -}}/api/prom/push"
|
url = nonsensitive(remote.kubernetes.secret.metrics_credentials.data["endpoint"])
|
||||||
|
|
||||||
basic_auth {
|
basic_auth {
|
||||||
username = "{{- .Values.cloud.metrics.username -}}"
|
username = nonsensitive(remote.kubernetes.secret.metrics_credentials.data["username"])
|
||||||
password = "{{- .Values.cloud.metrics.password -}}"
|
password = remote.kubernetes.secret.metrics_credentials.data["password"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -280,13 +364,13 @@ data:
|
|||||||
{{- if .Values.cloud.traces.enabled }}
|
{{- if .Values.cloud.traces.enabled }}
|
||||||
otelcol.exporter.otlp "cloud" {
|
otelcol.exporter.otlp "cloud" {
|
||||||
client {
|
client {
|
||||||
endpoint = "{{- .Values.cloud.traces.endpoint -}}"
|
endpoint = nonsensitive(remote.kubernetes.secret.traces_credentials.data["endpoint"])
|
||||||
auth = otelcol.auth.basic.creds.handler
|
auth = otelcol.auth.basic.creds.handler
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
otelcol.auth.basic "creds" {
|
otelcol.auth.basic "creds" {
|
||||||
username = "{{- .Values.cloud.traces.username -}}"
|
username = nonsensitive(remote.kubernetes.secret.traces_credentials.data["username"])
|
||||||
password = "{{- .Values.cloud.traces.password -}}"
|
password = remote.kubernetes.secret.traces_credentials.data["password"]
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.traces.enabled }}
|
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
{{- if .Values.local.grafana.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -0,0 +1,57 @@
|
|||||||
|
{{- if and .Values.local.grafana.enabled .Values.grafana.ingress.enabled -}}
|
||||||
|
{{- $ingressApiIsStable := eq (include "ingress.isStable" .) "true" -}}
|
||||||
|
{{- $ingressSupportsIngressClassName := eq (include "ingress.supportsIngressClassName" .) "true" -}}
|
||||||
|
{{- $ingressSupportsPathType := eq (include "ingress.supportsPathType" .) "true" -}}
|
||||||
|
apiVersion: {{ include "ingress.apiVersion" . }}
|
||||||
|
kind: Ingress
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
labels:
|
||||||
|
app: grafana
|
||||||
|
{{- range $labelKey, $labelValue := .Values.grafana.ingress.labels }}
|
||||||
|
{{ $labelKey }}: {{ $labelValue | toYaml }}
|
||||||
|
{{- end }}
|
||||||
|
{{- with .Values.grafana.ingress.annotations }}
|
||||||
|
annotations:
|
||||||
|
{{- toYaml . | nindent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
spec:
|
||||||
|
{{- if and $ingressSupportsIngressClassName .Values.grafana.ingress.ingressClassName }}
|
||||||
|
ingressClassName: {{ .Values.grafana.ingress.ingressClassName }}
|
||||||
|
{{- end -}}
|
||||||
|
{{- if .Values.grafana.ingress.tls }}
|
||||||
|
tls:
|
||||||
|
{{- range .Values.grafana.ingress.tls }}
|
||||||
|
- hosts:
|
||||||
|
{{- range .hosts }}
|
||||||
|
- {{ tpl . $ | quote }}
|
||||||
|
{{- end }}
|
||||||
|
{{- with .secretName }}
|
||||||
|
secretName: {{ . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
rules:
|
||||||
|
{{- range .Values.grafana.ingress.hosts }}
|
||||||
|
- host: {{ tpl .host $ | quote }}
|
||||||
|
http:
|
||||||
|
paths:
|
||||||
|
{{- range .paths }}
|
||||||
|
- path: {{ .path }}
|
||||||
|
{{- if $ingressSupportsPathType }}
|
||||||
|
pathType: {{ .pathType }}
|
||||||
|
{{- end }}
|
||||||
|
backend:
|
||||||
|
{{- if $ingressApiIsStable }}
|
||||||
|
service:
|
||||||
|
name: grafana
|
||||||
|
port:
|
||||||
|
number: 3000
|
||||||
|
{{- else }}
|
||||||
|
serviceName: grafana
|
||||||
|
servicePort: 3000
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
{{- if .Values.local.grafana.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: PersistentVolumeClaim
|
kind: PersistentVolumeClaim
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.logs.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.logs.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.metrics.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.metrics.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.metrics.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.metrics.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.metrics.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.dashboards.traces.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.traces.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
|
@@ -1,9 +1,10 @@
|
|||||||
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
{{- if .Values.local.grafana.enabled }}
|
||||||
|
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
metadata:
|
metadata:
|
||||||
name: meta-mimir-ruler-for-dashboards
|
name: {{ $.Release.Namespace }}-mimir-ruler-for-dashboards
|
||||||
namespace: meta
|
namespace: {{ $.Release.Namespace }}
|
||||||
spec:
|
spec:
|
||||||
progressDeadlineSeconds: 600
|
progressDeadlineSeconds: 600
|
||||||
replicas: 1
|
replicas: 1
|
||||||
@@ -24,7 +25,7 @@ spec:
|
|||||||
app.kubernetes.io/component: ruler-for-dashboards
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
app.kubernetes.io/instance: meta
|
app.kubernetes.io/instance: meta
|
||||||
app.kubernetes.io/name: mimir
|
app.kubernetes.io/name: mimir
|
||||||
namespace: meta
|
namespace: {{ $.Release.Namespace }}
|
||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- args:
|
- args:
|
||||||
@@ -91,8 +92,6 @@ spec:
|
|||||||
runAsUser: 10001
|
runAsUser: 10001
|
||||||
seccompProfile:
|
seccompProfile:
|
||||||
type: RuntimeDefault
|
type: RuntimeDefault
|
||||||
serviceAccount: meta-mimir
|
|
||||||
serviceAccountName: meta-mimir
|
|
||||||
terminationGracePeriodSeconds: 180
|
terminationGracePeriodSeconds: 180
|
||||||
topologySpreadConstraints:
|
topologySpreadConstraints:
|
||||||
- labelSelector:
|
- labelSelector:
|
||||||
@@ -109,11 +108,11 @@ spec:
|
|||||||
items:
|
items:
|
||||||
- key: mimir.yaml
|
- key: mimir.yaml
|
||||||
path: mimir.yaml
|
path: mimir.yaml
|
||||||
name: meta-mimir-config
|
name: {{ $.Release.Namespace }}-mimir-config
|
||||||
name: config
|
name: config
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: meta-mimir-runtime
|
name: {{ $.Release.Namespace }}-mimir-runtime
|
||||||
name: runtime-config
|
name: runtime-config
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: storage
|
name: storage
|
||||||
@@ -124,3 +123,4 @@ spec:
|
|||||||
name: rules
|
name: rules
|
||||||
name: rules
|
name: rules
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
{{- if .Values.local.metrics.enabled }}
|
||||||
|
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
@@ -16,3 +17,4 @@ data:
|
|||||||
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
|
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
@@ -3,20 +3,20 @@
|
|||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- if eq .Values.cloud.logs.enabled true -}}
|
{{- if eq .Values.cloud.logs.enabled true -}}
|
||||||
{{- if or (empty .Values.cloud.logs.endpoint) (or (empty .Values.cloud.logs.username) (empty .Values.cloud.logs.password)) -}}
|
{{- if empty .Values.cloud.logs.secret -}}
|
||||||
{{- fail "if cloud.logs is enabled then the endpoint, username and password have to be filled in" -}}
|
{{- fail "if cloud.logs is enabled then the secret has to be filled in" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- if eq .Values.cloud.metrics.enabled true -}}
|
{{- if eq .Values.cloud.metrics.enabled true -}}
|
||||||
{{- if or (empty .Values.cloud.metrics.endpoint) (or (empty .Values.cloud.metrics.username) (empty .Values.cloud.metrics.password)) -}}
|
{{- if empty .Values.cloud.metrics.secret -}}
|
||||||
{{- fail "if cloud.metrics is enabled then the endpoint, username and password have to be filled in" -}}
|
{{- fail "if cloud.metrics is enabled then the secret has to be filled in" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- if eq .Values.cloud.traces.enabled true -}}
|
{{- if eq .Values.cloud.traces.enabled true -}}
|
||||||
{{- if or (empty .Values.cloud.traces.endpoint) (or (empty .Values.cloud.traces.username) (empty .Values.cloud.traces.password)) -}}
|
{{- if empty .Values.cloud.traces.secret -}}
|
||||||
{{- fail "if cloud.traces is enabled then the endpoint, username and password have to be filled in" -}}
|
{{- fail "if cloud.traces is enabled then the secret has to be filled in" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
@@ -37,3 +37,7 @@
|
|||||||
{{- if empty .Values.namespacesToMonitor -}}
|
{{- if empty .Values.namespacesToMonitor -}}
|
||||||
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
|
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
|
{{- if empty .Values.metrics.retain -}}
|
||||||
|
{{- fail "All metrics will be collected, please specify some in metrics.retain" -}}
|
||||||
|
{{- end -}}
|
||||||
|
@@ -4,10 +4,24 @@ namespacesToMonitor:
|
|||||||
- mimir
|
- mimir
|
||||||
- tempo
|
- tempo
|
||||||
# The name of the cluster where this will be installed
|
# The name of the cluster where this will be installed
|
||||||
clusterName: "meta-monitoring"
|
clusterLabelValue: "meta-monitoring"
|
||||||
|
|
||||||
|
# Set to true to write logs, metrics or traces to Grafana Cloud
|
||||||
|
cloud:
|
||||||
|
logs:
|
||||||
|
enabled: true
|
||||||
|
secret: "logs"
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
secret: "metrics"
|
||||||
|
traces:
|
||||||
|
enabled: true
|
||||||
|
secret: "traces"
|
||||||
|
|
||||||
# Set to true for a local version of logs, metrics or traces
|
# Set to true for a local version of logs, metrics or traces
|
||||||
local:
|
local:
|
||||||
|
grafana:
|
||||||
|
enabled: false
|
||||||
logs:
|
logs:
|
||||||
enabled: false
|
enabled: false
|
||||||
metrics:
|
metrics:
|
||||||
@@ -17,33 +31,132 @@ local:
|
|||||||
minio:
|
minio:
|
||||||
enabled: false # This should be set to true if any of the previous is enabled
|
enabled: false # This should be set to true if any of the previous is enabled
|
||||||
|
|
||||||
# Set to true to write logs, metrics or traces to Grafana Cloud
|
grafana:
|
||||||
cloud:
|
# Gateway ingress configuration
|
||||||
logs:
|
ingress:
|
||||||
|
# -- Specifies whether an ingress for the gateway should be created
|
||||||
enabled: true
|
enabled: true
|
||||||
endpoint:
|
# -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18
|
||||||
username:
|
ingressClassName: ""
|
||||||
password:
|
# -- Annotations for the gateway ingress
|
||||||
metrics:
|
annotations: { }
|
||||||
enabled: true
|
# -- Labels for the gateway ingress
|
||||||
endpoint:
|
labels: { }
|
||||||
username:
|
# -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating
|
||||||
password:
|
hosts:
|
||||||
traces:
|
- host: monitoring.example.com
|
||||||
enabled: true
|
paths:
|
||||||
endpoint:
|
- path: /
|
||||||
username:
|
# -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers
|
||||||
password:
|
# pathType: Prefix
|
||||||
|
# -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating
|
||||||
|
#tls:
|
||||||
|
# - secretName: grafana-tls
|
||||||
|
# hosts:
|
||||||
|
# - monitoring.example.com
|
||||||
|
|
||||||
|
|
||||||
# Adding regexes here will add a stage.replace block for logs. For more information see
|
|
||||||
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
|
|
||||||
logs:
|
logs:
|
||||||
|
# Adding regexes here will add a stage.replace block for logs. For more information see
|
||||||
|
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
|
||||||
piiRegexes:
|
piiRegexes:
|
||||||
# This example replaces the word after password with *****
|
# This example replaces the word after password with *****
|
||||||
# - expression: "password (\\\\S+)"
|
# - expression: "password (\\\\S+)"
|
||||||
# source: "" # Empty uses the log message
|
# source: "" # Empty uses the log message
|
||||||
# replace: "*****"
|
# replace: "*****"
|
||||||
|
|
||||||
|
# The lines matching these will be kept in Loki
|
||||||
|
retain:
|
||||||
|
# This shows the queries
|
||||||
|
- caller=metrics.go
|
||||||
|
# This shows any errors
|
||||||
|
- level=error
|
||||||
|
# This shows the ingest requests and is very noisy. Uncomment to include.
|
||||||
|
# - caller=push.go
|
||||||
|
# Log lines for delete requests
|
||||||
|
- delete request for user added
|
||||||
|
- Started processing delete request
|
||||||
|
- delete request for user marked as processed
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
# The list of metrics to retain for logging dashboards
|
||||||
|
retain:
|
||||||
|
- agent_config_last_load_success_timestamp_seconds
|
||||||
|
- agent_config_last_load_successful
|
||||||
|
- agent_config_load_failures_total
|
||||||
|
- container_cpu_usage_seconds_total
|
||||||
|
- container_fs_writes_bytes_total
|
||||||
|
- container_memory_working_set_bytes
|
||||||
|
- container_network_receive_bytes_total
|
||||||
|
- container_network_transmit_bytes_total
|
||||||
|
- container_spec_cpu_period
|
||||||
|
- container_spec_cpu_quota
|
||||||
|
- container_spec_memory_limit_bytes
|
||||||
|
- cortex_ingester_flush_queue_length
|
||||||
|
- go_gc_duration_seconds
|
||||||
|
- go_goroutines
|
||||||
|
- go_memstats_heap_inuse_bytes
|
||||||
|
- kubelet_volume_stats_used_bytes
|
||||||
|
- kubelet_volume_stats_capacity_bytes
|
||||||
|
- kube_persistentvolumeclaim_labels
|
||||||
|
- kube_pod_container_resource_requests
|
||||||
|
- kube_pod_container_status_last_terminated_reason
|
||||||
|
- kube_pod_container_status_restarts_total
|
||||||
|
- loki_boltdb_shipper_compact_tables_operation_duration_seconds
|
||||||
|
- loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds
|
||||||
|
- loki_boltdb_shipper_retention_marker_count_total
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_count
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_sum
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_total
|
||||||
|
- loki_boltdb_shipper_request_duration_seconds_bucket
|
||||||
|
- loki_boltdb_shipper_request_duration_seconds_count
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_sum
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_marker_files_current
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_marker_file_processing_current_time
|
||||||
|
- loki_build_info
|
||||||
|
- loki_chunk_store_index_entries_per_chunk_count
|
||||||
|
- loki_chunk_store_index_entries_per_chunk_sum
|
||||||
|
- loki_compactor_delete_requests_processed_total
|
||||||
|
- loki_compactor_delete_requests_received_total
|
||||||
|
- loki_compactor_deleted_lines
|
||||||
|
- loki_compactor_oldest_pending_delete_request_age_seconds
|
||||||
|
- loki_compactor_pending_delete_requests_count
|
||||||
|
- loki_distributor_lines_received_total
|
||||||
|
- loki_ingester_chunk_age_seconds_bucket
|
||||||
|
- loki_ingester_chunk_age_seconds_count
|
||||||
|
- loki_ingester_chunk_age_seconds_sum
|
||||||
|
- loki_ingester_chunk_bounds_hours_bucket
|
||||||
|
- loki_ingester_chunk_bounds_hours_count
|
||||||
|
- loki_ingester_chunk_bounds_hours_sum
|
||||||
|
- loki_ingester_chunk_entries_bucket
|
||||||
|
- loki_ingester_chunk_entries_count
|
||||||
|
- loki_ingester_chunk_entries_sum
|
||||||
|
- loki_ingester_chunk_size_bytes_bucket
|
||||||
|
- loki_ingester_chunk_utilization_bucket
|
||||||
|
- loki_ingester_chunk_utilization_sum
|
||||||
|
- loki_ingester_chunks_flushed_total
|
||||||
|
- loki_ingester_memory_chunks
|
||||||
|
- loki_ingester_memory_streams
|
||||||
|
- loki_request_duration_seconds_count
|
||||||
|
- loki_ruler_wal_appender_ready
|
||||||
|
- loki_ruler_wal_disk_size
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_samples_pending
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_samples_total
|
||||||
|
- loki_ruler_wal_samples_appended_total
|
||||||
|
- loki_ruler_wal_storage_created_series_total
|
||||||
|
- loki_write_batch_retries_total
|
||||||
|
- loki_write_dropped_bytes_total
|
||||||
|
- loki_write_dropped_entries_total
|
||||||
|
- loki_write_sent_bytes_total
|
||||||
|
- loki_write_sent_entries_total
|
||||||
|
- node_disk_read_bytes_total
|
||||||
|
- node_disk_written_bytes_total
|
||||||
|
- promtail_custom_bad_words_total
|
||||||
|
|
||||||
# Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana
|
# Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana
|
||||||
dashboards:
|
dashboards:
|
||||||
logs:
|
logs:
|
||||||
@@ -90,7 +203,7 @@ loki:
|
|||||||
compactor:
|
compactor:
|
||||||
retention_enabled: true
|
retention_enabled: true
|
||||||
limits_config:
|
limits_config:
|
||||||
retention_period: 24h
|
retention_period: 30d
|
||||||
monitoring:
|
monitoring:
|
||||||
dashboards:
|
dashboards:
|
||||||
enabled: false
|
enabled: false
|
||||||
@@ -109,10 +222,26 @@ loki:
|
|||||||
|
|
||||||
grafana-agent:
|
grafana-agent:
|
||||||
agent:
|
agent:
|
||||||
|
clustering:
|
||||||
|
enabled: true
|
||||||
configMap:
|
configMap:
|
||||||
create: false
|
create: false
|
||||||
name: "agent-configmap"
|
name: "agent-configmap"
|
||||||
key: 'config.river'
|
key: 'config.river'
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: '1000m'
|
||||||
|
memory: '600Mi'
|
||||||
|
limits:
|
||||||
|
memory: '4Gi'
|
||||||
|
controller:
|
||||||
|
type: "statefulset"
|
||||||
|
autoscaling:
|
||||||
|
enabled: true
|
||||||
|
minReplicas: 3
|
||||||
|
maxReplicas: 30
|
||||||
|
targetMemoryUtilizationPercentage: 90
|
||||||
|
targetCPUUtilizationPercentage: 90
|
||||||
|
|
||||||
mimir-distributed:
|
mimir-distributed:
|
||||||
minio:
|
minio:
|
||||||
@@ -142,7 +271,7 @@ mimir-distributed:
|
|||||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
||||||
insecure: true
|
insecure: true
|
||||||
limits:
|
limits:
|
||||||
compactor_blocks_retention_period: 24h
|
compactor_blocks_retention_period: 30d
|
||||||
|
|
||||||
tempo-distributed:
|
tempo-distributed:
|
||||||
tempo:
|
tempo:
|
||||||
@@ -158,7 +287,7 @@ tempo-distributed:
|
|||||||
insecure: true
|
insecure: true
|
||||||
compactor:
|
compactor:
|
||||||
compaction:
|
compaction:
|
||||||
block_retention: 24h
|
block_retention: 30d
|
||||||
traces:
|
traces:
|
||||||
otlp:
|
otlp:
|
||||||
http:
|
http:
|
||||||
@@ -193,4 +322,4 @@ minio:
|
|||||||
cpu: 100m
|
cpu: 100m
|
||||||
memory: 128Mi
|
memory: 128Mi
|
||||||
# Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this.
|
# Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this.
|
||||||
configPathmc: "/tmp/minio/mc/"
|
configPathmc: "/tmp/minio/mc/"
|
||||||
|
@@ -6,7 +6,26 @@
|
|||||||
kubectl create namespace meta
|
kubectl create namespace meta
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml).
|
1. Create secrets with credentials and the endpoint when sending logs, metrics or traces to Grafana Cloud.
|
||||||
|
|
||||||
|
```
|
||||||
|
kubectl create secret generic logs -n meta \
|
||||||
|
--from-literal=username=<logs username> \
|
||||||
|
--from-literal=password=<logs password> \
|
||||||
|
--from-literal=endpoint='https://logs-prod-us-central1.grafana.net/loki/api/v1/push'
|
||||||
|
|
||||||
|
kubectl create secret generic metrics -n meta \
|
||||||
|
--from-literal=username=<metrics username> \
|
||||||
|
--from-literal=password=<metrics password> \
|
||||||
|
--from-literal=endpoint='https://prometheus-us-central1.grafana.net/api/prom/push'
|
||||||
|
|
||||||
|
kubectl create secret generic traces -n meta \
|
||||||
|
--from-literal=username=<traces username> \
|
||||||
|
--from-literal=password=<traces password> \
|
||||||
|
--from-literal=endpoint='https://tempo-us-central1.grafana.net/tempo'
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). Fill in the names of the secrets created above as needed.
|
||||||
|
|
||||||
1. Install this helm chart
|
1. Install this helm chart
|
||||||
|
|
||||||
|
9
tools/kind.config
Normal file
9
tools/kind.config
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
kind: Cluster
|
||||||
|
apiVersion: kind.x-k8s.io/v1alpha4
|
||||||
|
name: meta
|
||||||
|
nodes:
|
||||||
|
- role: control-plane
|
||||||
|
- role: worker
|
||||||
|
- role: worker
|
||||||
|
- role: worker
|
||||||
|
|
Reference in New Issue
Block a user