forked from RemoteSync/grafana-meta-monitoring-chart
Compare commits
144 Commits
add_ci
...
add_creati
Author | SHA1 | Date | |
---|---|---|---|
|
2144cea411 | ||
|
81a017551b | ||
|
11d80263a7 | ||
|
cdb0bee56e | ||
|
58171a6a42 | ||
|
c65445384b | ||
|
1f980f393e | ||
|
47d9190eda | ||
|
5ff9bd16c9 | ||
|
d6faaf88f5 | ||
|
2d711f7168 | ||
|
c666bf69c9 | ||
|
41619b99b1 | ||
|
5923139796 | ||
|
329d5822ea | ||
|
5498b27ad6 | ||
|
da687315e7 | ||
|
8f20e45c77 | ||
|
e81b1246f5 | ||
|
b103fb3434 | ||
|
9349d2d906 | ||
|
31536103c8 | ||
|
13c28aa50a | ||
|
385d0dd543 | ||
|
458451922d | ||
|
4b0d457af0 | ||
|
e60b2aecdc | ||
|
6244de677e | ||
|
d14e933e84 | ||
|
0210fba39d | ||
|
a97fa64880 | ||
|
34545e15b4 | ||
|
33b8e37bed | ||
|
0938193982 | ||
|
b1975505e5 | ||
|
c282bf352d | ||
|
60af0b4d19 | ||
|
0980cb2ede | ||
|
75ab1f0d97 | ||
|
dd49623508 | ||
|
095fb09d26 | ||
|
0fc5e2f847 | ||
|
d04d74cc26 | ||
|
b840555522 | ||
|
e1a8495227 | ||
|
a812b4f63a | ||
|
8dde9642c9 | ||
|
0832bc8e8f | ||
|
cbae75acb8 | ||
|
f5a5472b95 | ||
|
58a4696a6b | ||
|
e31f6b0906 | ||
|
351f50e238 | ||
|
170c17b721 | ||
|
b2d06ab8e3 | ||
|
9d63c32d4f | ||
|
a201cef34c | ||
|
ea4d5e278a | ||
|
e3c3f6a094 | ||
|
d6da6fec35 | ||
|
5e2ffb222b | ||
|
9b12bad16c | ||
|
ed31bcf345 | ||
|
a0184e27d0 | ||
|
3491886311 | ||
|
c58b76cfc7 | ||
|
a72e64327f | ||
|
cfdc6b95eb | ||
|
b1ccef91cb | ||
|
f3f970d783 | ||
|
b78571dfdc | ||
|
1859c3a82c | ||
|
f275b2d1b6 | ||
|
fd1aadc099 | ||
|
a6462d1ac1 | ||
|
690cda9eb5 | ||
|
00cad594f4 | ||
|
e74ec96349 | ||
|
0d3f9a1416 | ||
|
8fa5b63db7 | ||
|
d7063da3d4 | ||
|
e7f28a261e | ||
|
509a32bc59 | ||
|
6bb31ad5e0 | ||
|
7724d9c928 | ||
|
13294675fe | ||
|
bf71def2f8 | ||
|
b37fa4adf5 | ||
|
18a5face81 | ||
|
5e908f796c | ||
|
17b52d572a | ||
|
6eac38d4ec | ||
|
3706c702a1 | ||
|
28b77dab17 | ||
|
9770a3e5b3 | ||
|
6cbffd6d9d | ||
|
4ae23a99d2 | ||
|
20232e9cf3 | ||
|
043a503ce7 | ||
|
39f50d8580 | ||
|
d9fc9e4f4e | ||
|
f61913d3da | ||
|
c29daab64d | ||
|
d389a9f741 | ||
|
6f5f50f901 | ||
|
efea1c5054 | ||
|
b02aee6816 | ||
|
c522e3f39e | ||
|
e3542e472d | ||
|
3a138991ff | ||
|
cd78caab48 | ||
|
f281741de9 | ||
|
381ecb2c06 | ||
|
20cdb8dcc1 | ||
|
019f2b7b1e | ||
|
1bffcac5e5 | ||
|
d23291dc91 | ||
|
a89ba944a3 | ||
|
ef05e599e6 | ||
|
a586e753da | ||
|
76908c1e9e | ||
|
bc5cdadb9f | ||
|
687c77c0f6 | ||
|
2a0b14ee45 | ||
|
7e06d611a7 | ||
|
f4934d6007 | ||
|
427764278c | ||
|
1093e91741 | ||
|
1ed196299b | ||
|
faa0015c11 | ||
|
53416e042c | ||
|
d804da13f1 | ||
|
8c0b68fe02 | ||
|
99bb8f13c2 | ||
|
26ff679cbb | ||
|
fb3e3ece1b | ||
|
7a5358b322 | ||
|
9c92e18efe | ||
|
ffe220590d | ||
|
e3708ce3fe | ||
|
3149f4df9b | ||
|
86ec586917 | ||
|
6cd12bee01 | ||
|
b042b396a2 |
3
.github/configs/cr.yaml
vendored
Normal file
3
.github/configs/cr.yaml
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
owner: grafana
|
||||
git-repo: helm-charts
|
||||
skip-existing: true
|
15
.github/configs/ct.yaml
vendored
Normal file
15
.github/configs/ct.yaml
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
## Reference: https://github.com/helm/chart-testing/blob/master/doc/ct_lint-and-install.md
|
||||
remote: origin
|
||||
target-branch: main
|
||||
chart-dirs:
|
||||
- charts
|
||||
chart-repos:
|
||||
- grafana=https://grafana.github.io/helm-charts
|
||||
- minio=https://charts.min.io
|
||||
validate-chart-schema: true
|
||||
validate-maintainers: true
|
||||
validate-yaml: true
|
||||
exclude-deprecated: true
|
||||
excluded-charts: []
|
||||
namespace: meta-monitoring # Need to set the namespace because we create the secret there
|
||||
release-label: app.kubernetes.io/instance
|
30
.github/configs/updatecli.d/alloy.yaml
vendored
Normal file
30
.github/configs/updatecli.d/alloy.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Bump dependency "alloy" for Helm chart "meta-monitoring"
|
||||
sources:
|
||||
alloy:
|
||||
name: Get latest "alloy" Helm chart version
|
||||
kind: helmchart
|
||||
spec:
|
||||
name: alloy
|
||||
url: https://grafana.github.io/helm-charts
|
||||
versionfilter:
|
||||
kind: semver
|
||||
pattern: '*'
|
||||
conditions:
|
||||
alloy:
|
||||
name: Ensure Helm chart dependency "alloy" is specified
|
||||
kind: yaml
|
||||
spec:
|
||||
file: charts/meta-monitoring/Chart.yaml
|
||||
key: $.dependencies[1].name
|
||||
value: alloy
|
||||
disablesourceinput: true
|
||||
targets:
|
||||
alloy:
|
||||
name: Bump Helm chart dependency "alloy" for Helm chart "meta-monitoring"
|
||||
kind: helmchart
|
||||
spec:
|
||||
file: Chart.yaml
|
||||
key: $.dependencies[1].version
|
||||
name: charts/meta-monitoring
|
||||
versionincrement: none
|
||||
sourceid: alloy
|
30
.github/configs/updatecli.d/loki.yaml
vendored
Normal file
30
.github/configs/updatecli.d/loki.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Bump dependency "loki" for Helm chart "meta-monitoring"
|
||||
sources:
|
||||
loki:
|
||||
name: Get latest "loki" Helm chart version
|
||||
kind: helmchart
|
||||
spec:
|
||||
name: loki
|
||||
url: https://grafana.github.io/helm-charts
|
||||
versionfilter:
|
||||
kind: semver
|
||||
pattern: '*'
|
||||
conditions:
|
||||
loki:
|
||||
name: Ensure Helm chart dependency "loki" is specified
|
||||
kind: yaml
|
||||
spec:
|
||||
file: charts/meta-monitoring/Chart.yaml
|
||||
key: $.dependencies[0].name
|
||||
value: loki
|
||||
disablesourceinput: true
|
||||
targets:
|
||||
loki:
|
||||
name: Bump Helm chart dependency "loki" for Helm chart "meta-monitoring"
|
||||
kind: helmchart
|
||||
spec:
|
||||
file: Chart.yaml
|
||||
key: $.dependencies[0].version
|
||||
name: charts/meta-monitoring
|
||||
versionincrement: none
|
||||
sourceid: loki
|
30
.github/configs/updatecli.d/mimir-distributed.yaml
vendored
Normal file
30
.github/configs/updatecli.d/mimir-distributed.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Bump dependency "mimir-distributed" for Helm chart "meta-monitoring"
|
||||
sources:
|
||||
mimir-distributed:
|
||||
name: Get latest "mimir-distributed" Helm chart version
|
||||
kind: helmchart
|
||||
spec:
|
||||
name: mimir-distributed
|
||||
url: https://grafana.github.io/helm-charts
|
||||
versionfilter:
|
||||
kind: semver
|
||||
pattern: '*'
|
||||
conditions:
|
||||
mimir-distributed:
|
||||
name: Ensure Helm chart dependency "mimir-distributed" is specified
|
||||
kind: yaml
|
||||
spec:
|
||||
file: charts/meta-monitoring/Chart.yaml
|
||||
key: $.dependencies[2].name
|
||||
value: mimir-distributed
|
||||
disablesourceinput: true
|
||||
targets:
|
||||
mimir-distributed:
|
||||
name: Bump Helm chart dependency "mimir-distributed" for Helm chart "meta-monitoring"
|
||||
kind: helmchart
|
||||
spec:
|
||||
file: Chart.yaml
|
||||
key: $.dependencies[2].version
|
||||
name: charts/meta-monitoring
|
||||
versionincrement: none
|
||||
sourceid: mimir-distributed
|
30
.github/configs/updatecli.d/minio.yaml
vendored
Normal file
30
.github/configs/updatecli.d/minio.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Bump dependency "minio" for Helm chart "meta-monitoring"
|
||||
sources:
|
||||
minio:
|
||||
name: Get latest "minio" Helm chart version
|
||||
kind: helmchart
|
||||
spec:
|
||||
name: minio
|
||||
url: https://charts.min.io
|
||||
versionfilter:
|
||||
kind: semver
|
||||
pattern: '*'
|
||||
conditions:
|
||||
minio:
|
||||
name: Ensure Helm chart dependency "minio" is specified
|
||||
kind: yaml
|
||||
spec:
|
||||
file: charts/meta-monitoring/Chart.yaml
|
||||
key: $.dependencies[4].name
|
||||
value: minio
|
||||
disablesourceinput: true
|
||||
targets:
|
||||
minio:
|
||||
name: Bump Helm chart dependency "minio" for Helm chart "meta-monitoring"
|
||||
kind: helmchart
|
||||
spec:
|
||||
file: Chart.yaml
|
||||
key: $.dependencies[4].version
|
||||
name: charts/meta-monitoring
|
||||
versionincrement: none
|
||||
sourceid: minio
|
30
.github/configs/updatecli.d/tempo-distributed.yaml
vendored
Normal file
30
.github/configs/updatecli.d/tempo-distributed.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Bump dependency "tempo-distributed" for Helm chart "meta-monitoring"
|
||||
sources:
|
||||
tempo-distributed:
|
||||
name: Get latest "tempo-distributed" Helm chart version
|
||||
kind: helmchart
|
||||
spec:
|
||||
name: tempo-distributed
|
||||
url: https://grafana.github.io/helm-charts
|
||||
versionfilter:
|
||||
kind: semver
|
||||
pattern: '*'
|
||||
conditions:
|
||||
tempo-distributed:
|
||||
name: Ensure Helm chart dependency "tempo-distributed" is specified
|
||||
kind: yaml
|
||||
spec:
|
||||
file: charts/meta-monitoring/Chart.yaml
|
||||
key: $.dependencies[3].name
|
||||
value: tempo-distributed
|
||||
disablesourceinput: true
|
||||
targets:
|
||||
tempo-distributed:
|
||||
name: Bump Helm chart dependency "tempo-distributed" for Helm chart "meta-monitoring"
|
||||
kind: helmchart
|
||||
spec:
|
||||
file: Chart.yaml
|
||||
key: $.dependencies[3].version
|
||||
name: charts/meta-monitoring
|
||||
versionincrement: none
|
||||
sourceid: tempo-distributed
|
177
.github/workflows/check-for-dependency-updates.yaml
vendored
Normal file
177
.github/workflows/check-for-dependency-updates.yaml
vendored
Normal file
@@ -0,0 +1,177 @@
|
||||
---
|
||||
name: Check for dependency updates
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Run once a day
|
||||
- cron: '0 7 * * *'
|
||||
|
||||
permissions:
|
||||
contents: "write"
|
||||
pull-requests: "write"
|
||||
|
||||
env:
|
||||
UPDATECLI_CONFIG_DIR: "${{ github.workspace }}/.github/configs/updatecli.d"
|
||||
UPDATECLI_GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
|
||||
|
||||
jobs:
|
||||
updateLoki:
|
||||
name: Update the Loki subchart
|
||||
runs-on: "ubuntu-latest"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Updatecli
|
||||
uses: updatecli/updatecli-action@v2
|
||||
|
||||
- name: Run Updatecli
|
||||
id: update-loki
|
||||
run: |
|
||||
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/loki.yaml
|
||||
if ! git diff --exit-code > /dev/null; then
|
||||
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||
fi
|
||||
|
||||
- name: Create pull request
|
||||
if: steps.update-loki.outputs.changed == 'true'
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
with:
|
||||
title: "[dependency] Update the Loki subchart"
|
||||
body: "Updates the Loki subchart"
|
||||
base: main
|
||||
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||
committer: "GitHub <noreply@github.com>"
|
||||
commit-message: Update loki
|
||||
labels: dependencies
|
||||
branch: chore/update-loki
|
||||
delete-branch: true
|
||||
|
||||
updateGrafanaAlloy:
|
||||
name: Update the Grafana Alloy subchart
|
||||
runs-on: "ubuntu-latest"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Updatecli
|
||||
uses: updatecli/updatecli-action@v2
|
||||
|
||||
- name: Run Updatecli
|
||||
id: update-grafana-alloy
|
||||
run: |
|
||||
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/alloy.yaml
|
||||
if ! git diff --exit-code > /dev/null; then
|
||||
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||
fi
|
||||
|
||||
- name: Create pull request
|
||||
if: steps.update-grafana-alloy.outputs.changed == 'true'
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
with:
|
||||
title: "[dependency] Update the Grafana Alloy subchart"
|
||||
body: "Updates the Grafana Alloy subchart"
|
||||
base: main
|
||||
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||
committer: "GitHub <noreply@github.com>"
|
||||
commit-message: Update Grafana Alloy
|
||||
labels: dependencies
|
||||
branch: chore/update-grafana-alloy
|
||||
delete-branch: true
|
||||
|
||||
updateMimirDistributed:
|
||||
name: Update the Mimir Distributed subchart
|
||||
runs-on: "ubuntu-latest"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Updatecli
|
||||
uses: updatecli/updatecli-action@v2
|
||||
|
||||
- name: Run Updatecli
|
||||
id: update-mimir-distributed
|
||||
run: |
|
||||
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/mimir-distributed.yaml
|
||||
if ! git diff --exit-code > /dev/null; then
|
||||
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||
fi
|
||||
|
||||
- name: Create pull request
|
||||
if: steps.update-mimir-distributed.outputs.changed == 'true'
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
with:
|
||||
title: "[dependency] Update the Mimir Distributed subchart"
|
||||
body: "Updates the Mimir Distributed subchart"
|
||||
base: main
|
||||
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||
committer: "GitHub <noreply@github.com>"
|
||||
commit-message: Update Mimir Distributed
|
||||
labels: dependencies
|
||||
branch: chore/update-mimir-distributed
|
||||
delete-branch: true
|
||||
|
||||
updateTempoDistributed:
|
||||
name: Update the Tempo Distributed subchart
|
||||
runs-on: "ubuntu-latest"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Updatecli
|
||||
uses: updatecli/updatecli-action@v2
|
||||
|
||||
- name: Run Updatecli
|
||||
id: update-tempo-distributed
|
||||
run: |
|
||||
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/tempo-distributed.yaml
|
||||
if ! git diff --exit-code > /dev/null; then
|
||||
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||
fi
|
||||
|
||||
- name: Create pull request
|
||||
if: steps.update-tempo-distributed.outputs.changed == 'true'
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
with:
|
||||
title: "[dependency] Update the Tempo Distributed subchart"
|
||||
body: "Updates the tempo Distributed subchart"
|
||||
base: main
|
||||
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||
committer: "GitHub <noreply@github.com>"
|
||||
commit-message: Update Tempo Distributed
|
||||
labels: dependencies
|
||||
branch: chore/update-tempo-distributed
|
||||
delete-branch: true
|
||||
|
||||
updateMinio:
|
||||
name: Update the Minio subchart
|
||||
runs-on: "ubuntu-latest"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Updatecli
|
||||
uses: updatecli/updatecli-action@v2
|
||||
|
||||
- name: Run Updatecli
|
||||
id: update-minio
|
||||
run: |
|
||||
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/minio.yaml
|
||||
if ! git diff --exit-code > /dev/null; then
|
||||
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||
fi
|
||||
|
||||
- name: Create pull request
|
||||
if: steps.update-minio.outputs.changed == 'true'
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
with:
|
||||
title: "[dependency] Update the Minio subchart"
|
||||
body: "Updates the Minio subchart"
|
||||
base: main
|
||||
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||
committer: "GitHub <noreply@github.com>"
|
||||
commit-message: Update minio
|
||||
labels: dependencies
|
||||
branch: chore/update-minio
|
||||
delete-branch: true
|
111
.github/workflows/helm-ci.yml
vendored
111
.github/workflows/helm-ci.yml
vendored
@@ -16,86 +16,51 @@ jobs:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
# - name: Check Docs
|
||||
# run: |
|
||||
# docker run --rm --volume "$(pwd):/helm-docs" -u "$(id -u)" jnorwood/helm-docs:v1.11.0
|
||||
# if ! git diff --exit-code; then
|
||||
# echo "Documentation not up to date. Please run helm-docs and commit changes!" >&2
|
||||
# exit 1
|
||||
# fi
|
||||
|
||||
- name: Lint Yaml
|
||||
run: make helm-lint
|
||||
|
||||
# - name: Lint Code Base
|
||||
# uses: docker://github/super-linter:v3.12.0
|
||||
# env:
|
||||
# FILTER_REGEX_EXCLUDE: .*(README\.md|Chart\.yaml|NOTES.txt).*
|
||||
# FILTER_REGEX_INCLUDE: .*charts/meta-monitoring/.*
|
||||
# VALIDATE_ALL_CODEBASE: false
|
||||
# VALIDATE_KUBERNETES_KUBEVAL: false
|
||||
# VALIDATE_YAML: false
|
||||
# VALIDATE_GO: false
|
||||
# DEFAULT_BRANCH: main
|
||||
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
call-test:
|
||||
name: Test Helm Chart
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
# call-test:
|
||||
# name: Test Helm Chart
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v3
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
|
||||
- name: Set up Helm
|
||||
uses: azure/setup-helm@v3
|
||||
with:
|
||||
version: v3.8.2
|
||||
# - name: Set up Helm
|
||||
# uses: azure/setup-helm@v3
|
||||
# with:
|
||||
# version: v3.8.2
|
||||
|
||||
# Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
|
||||
# yamllint (https://github.com/adrienverge/yamllint) which require Python
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.7
|
||||
# # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
|
||||
# # yamllint (https://github.com/adrienverge/yamllint) which require Python
|
||||
# - name: Set up Python
|
||||
# uses: actions/setup-python@v4
|
||||
# with:
|
||||
# python-version: 3.7
|
||||
|
||||
- name: Set up chart-testing
|
||||
uses: helm/chart-testing-action@v2.4.0
|
||||
# - name: Set up chart-testing
|
||||
# uses: helm/chart-testing-action@v2.4.0
|
||||
|
||||
- name: Run chart-testing (list-changed)
|
||||
id: list-changed
|
||||
run: |
|
||||
changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||
if [[ -n "$changed" ]]; then
|
||||
echo "changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
# - name: Run chart-testing (list-changed)
|
||||
# id: list-changed
|
||||
# run: |
|
||||
# changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||
# if [[ -n "$changed" ]]; then
|
||||
# echo "changed=true" >> $GITHUB_OUTPUT
|
||||
# fi
|
||||
|
||||
- name: Run chart-testing (lint)
|
||||
run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
|
||||
# - name: Run chart-testing (lint)
|
||||
# run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
|
||||
|
||||
- name: Create kind cluster
|
||||
uses: helm/kind-action@v1.8.0
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
# - name: Create kind cluster
|
||||
# uses: helm/kind-action@v1.8.0
|
||||
# if: steps.list-changed.outputs.changed == 'true'
|
||||
# with:
|
||||
# config: tools/kind.config
|
||||
|
||||
# - name: Install prometheus operator
|
||||
# id: install-prometheus
|
||||
# if: steps.list-changed.outputs.changed == 'true'
|
||||
# run: |
|
||||
# kubectl create namespace prometheus
|
||||
|
||||
# helm install prometheus prometheus-community/kube-prometheus-stack \
|
||||
# --namespace prometheus \
|
||||
# --set grafana.enabled=false \
|
||||
# --set prometheus.prometheusSpec.serviceMonitorSelector.matchLabels.release=prometheus
|
||||
|
||||
# kubectl --namespace prometheus get pods -l "release=prometheus"
|
||||
# kubectl --namespace prometheus get services -l "release=prometheus"
|
||||
|
||||
- name: Run chart-testing (install)
|
||||
run: |
|
||||
changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||
if [[ "$changed" == "charts/enterprise-metrics" ]]; then
|
||||
# Do not run `ct install` for enterprise-metrics
|
||||
exit 0
|
||||
fi
|
||||
ct install --config "${CT_CONFIGFILE}"
|
||||
# - name: Run chart-testing (install)
|
||||
# run: |
|
||||
# changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||
# ct install --config "${CT_CONFIGFILE}"
|
||||
|
175
.github/workflows/helm-release.yml
vendored
Normal file
175
.github/workflows/helm-release.yml
vendored
Normal file
@@ -0,0 +1,175 @@
|
||||
name: Release Helm chart
|
||||
on:
|
||||
workflow_dispatch:
|
||||
env:
|
||||
CR_CONFIGFILE: "${{ github.workspace }}/source/.github/configs/cr.yaml"
|
||||
CT_CONFIGFILE: "${{ github.workspace }}/source/.github/configs/ct.yaml"
|
||||
CR_INDEX_PATH: "${{ github.workspace }}/.cr-index"
|
||||
CR_PACKAGE_PATH: "${{ github.workspace }}/.cr-release-packages"
|
||||
CR_TOOL_PATH: "${{ github.workspace }}/.cr-tool"
|
||||
CR_VERSION: "1.5.0"
|
||||
jobs:
|
||||
setup:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
changed: ${{ steps.list-changed.outputs.changed }}
|
||||
chartpath: ${{ steps.list-changed.outputs.chartpath }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
path: source
|
||||
|
||||
- name: Install chart-testing
|
||||
uses: helm/chart-testing-action@v2
|
||||
|
||||
- name: List changed charts
|
||||
id: list-changed
|
||||
run: |
|
||||
cd source
|
||||
|
||||
latest_tag=$( if ! git describe --tags --abbrev=0 --match='helm-chart/*' 2> /dev/null ; then git rev-list --max-parents=0 --first-parent HEAD; fi )
|
||||
|
||||
echo "Running: ct list-changed --config ${CT_CONFIGFILE} --since ${latest_tag} --target-branch ${{ github.ref_name }}"
|
||||
changed=$(ct list-changed --config "${CT_CONFIGFILE}" --since "${latest_tag}" --target-branch "${{ github.ref_name }}")
|
||||
echo "${changed}"
|
||||
|
||||
num_changed=$(wc -l <<< ${changed})
|
||||
if [[ "${num_changed}" -gt "1" ]] ; then
|
||||
echo "More than one chart changed, exiting"
|
||||
exit 1
|
||||
fi
|
||||
if [[ -n "${changed}" ]]; then
|
||||
name=$(yq ".name" < ${changed}/Chart.yaml)
|
||||
version=$(yq ".version" < ${changed}/Chart.yaml)
|
||||
tagname="v${version}"
|
||||
|
||||
if [ $(git tag -l "${tagname}") ]; then
|
||||
echo "Tag ${tagname} already exists, skipping release"
|
||||
echo "changed=false" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "Releasing ${changed}"
|
||||
echo "changed=true" >> $GITHUB_OUTPUT
|
||||
echo "chartpath=${changed}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
else
|
||||
echo "No charts have changed, skipping release"
|
||||
echo "changed=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
release:
|
||||
needs: [setup]
|
||||
runs-on: ubuntu-latest
|
||||
if: needs.setup.outputs.changed == 'true'
|
||||
permissions:
|
||||
contents: write
|
||||
id-token: write
|
||||
steps:
|
||||
- id: get-secrets
|
||||
uses: grafana/shared-workflows/actions/get-vault-secrets@main
|
||||
with:
|
||||
# Secrets placed in the ci/repo/grafana/<repo>/<path> path in Vault
|
||||
repo_secrets: |
|
||||
APP_ID=github-app:app-id
|
||||
PRIVATE_KEY=github-app:private-key
|
||||
- uses: actions/create-github-app-token@v1
|
||||
id: app-token
|
||||
with:
|
||||
app-id: ${{ env.APP_ID }}
|
||||
private-key: ${{ env.PRIVATE_KEY }}
|
||||
owner: ${{ github.repository_owner }}
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
path: source
|
||||
|
||||
- name: Configure Git
|
||||
run: |
|
||||
cd source
|
||||
git config user.name "$GITHUB_ACTOR"
|
||||
git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
|
||||
|
||||
- name: Checkout helm-charts
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
repository: grafana/helm-charts
|
||||
path: helm-charts
|
||||
token: "${{ steps.app-token.outputs.token }}"
|
||||
|
||||
- name: Configure Git for helm-charts
|
||||
run: |
|
||||
cd helm-charts
|
||||
git config user.name "$GITHUB_ACTOR"
|
||||
git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
|
||||
|
||||
- name: Set up Helm
|
||||
uses: azure/setup-helm@v4
|
||||
|
||||
- name: Parse Chart.yaml
|
||||
id: parse-chart
|
||||
run: |
|
||||
cd source
|
||||
changed="${{ needs.setup.outputs.chartpath }}"
|
||||
description=$(yq ".description" < ${changed}/Chart.yaml)
|
||||
name=$(yq ".name" < ${changed}/Chart.yaml)
|
||||
version=$(yq ".version" < ${changed}/Chart.yaml)
|
||||
|
||||
echo "chartpath=${changed}" >> $GITHUB_OUTPUT
|
||||
echo "desc=${description}" >> $GITHUB_OUTPUT
|
||||
echo "tagname=v${version}" >> $GITHUB_OUTPUT
|
||||
echo "packagename=${name}-${version}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Install CR tool
|
||||
run: |
|
||||
mkdir "${CR_TOOL_PATH}"
|
||||
mkdir "${CR_PACKAGE_PATH}"
|
||||
mkdir "${CR_INDEX_PATH}"
|
||||
curl -sSLo cr.tar.gz "https://github.com/helm/chart-releaser/releases/download/v${CR_VERSION}/chart-releaser_${CR_VERSION}_linux_amd64.tar.gz"
|
||||
tar -xzf cr.tar.gz -C "${CR_TOOL_PATH}"
|
||||
rm -f cr.tar.gz
|
||||
|
||||
- name: Create Helm package
|
||||
run: |
|
||||
cd source
|
||||
helm repo add grafana https://grafana.github.io/helm-charts
|
||||
helm repo add minio https://charts.min.io
|
||||
|
||||
"${CR_TOOL_PATH}/cr" package "${{ steps.parse-chart.outputs.chartpath }}" --config "${CR_CONFIGFILE}" --package-path "${CR_PACKAGE_PATH}"
|
||||
|
||||
- name: Make a release on this repo
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
name: ${{ steps.parse-chart.outputs.tagname }}
|
||||
repository: grafana/meta-monitoring-chart
|
||||
tag_name: ${{ steps.parse-chart.outputs.tagname }}
|
||||
token: ${{ steps.app-token.outputs.token }}
|
||||
generate_release_notes: true
|
||||
files: |
|
||||
${{ env.CR_PACKAGE_PATH }}/${{ steps.parse-chart.outputs.packagename }}.tgz
|
||||
|
||||
# Note that this creates a release in grafana/helm-charts with a new tag.
|
||||
# The tag name in grafana/helm-charts is <package>-<version>, while the
|
||||
# tag name for grafana/meta-monitoring-chart is v<version>.
|
||||
- name: Make release on Helm Charts
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
name: ${{ steps.parse-chart.outputs.packagename }}
|
||||
repository: grafana/helm-charts
|
||||
tag_name: ${{ steps.parse-chart.outputs.packagename }}
|
||||
token: ${{ steps.app-token.outputs.token }}
|
||||
body: |
|
||||
${{ steps.parse-chart.outputs.desc }}
|
||||
|
||||
Source commit: https://github.com/${{ github.repository }}/commit/${{ github.sha }}
|
||||
|
||||
Tag on source: https://github.com/${{ github.repository }}/releases/tag/${{ steps.parse-chart.outputs.tagname }}
|
||||
files: |
|
||||
${{ env.CR_PACKAGE_PATH }}/${{ steps.parse-chart.outputs.packagename }}.tgz
|
||||
|
||||
- name: Update helm-charts index.yaml
|
||||
run: |
|
||||
cd helm-charts
|
||||
"${CR_TOOL_PATH}/cr" index --config "${CR_CONFIGFILE}" --token "${{ steps.app-token.outputs.token }}" --index-path "${CR_INDEX_PATH}" --package-path "${CR_PACKAGE_PATH}" --push
|
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
production/
|
14
Makefile
14
Makefile
@@ -6,5 +6,17 @@ help:
|
||||
|
||||
.PHONY: helm-lint
|
||||
|
||||
helm-lint: ## run helm linter
|
||||
helm-lint: ## Run helm linter
|
||||
$(MAKE) -BC charts/meta-monitoring lint
|
||||
|
||||
MIXIN_PATH := production/loki-mixin
|
||||
MIXIN_OUT_PATH_META_MONITORING := production/loki-mixin-compiled-meta-monitoring
|
||||
|
||||
mixin: ## Create our version of the mixin
|
||||
@rm -rf $(MIXIN_PATH)
|
||||
./scripts/clone_loki_mixin.sh
|
||||
@rm -rf $(MIXIN_OUT_PATH_META_MONITORING) && mkdir $(MIXIN_OUT_PATH_META_MONITORING)
|
||||
@cd $(MIXIN_PATH) && jb install
|
||||
@mixtool generate all --output-alerts $(MIXIN_OUT_PATH_META_MONITORING)/alerts.yaml --output-rules $(MIXIN_OUT_PATH_META_MONITORING)/rules.yaml --directory $(MIXIN_OUT_PATH_META_MONITORING)/dashboards ${MIXIN_PATH}/mixin-meta-monitoring.libsonnet
|
||||
@cp $(MIXIN_OUT_PATH_META_MONITORING)/dashboards/* charts/meta-monitoring/src/dashboards
|
||||
@cp $(MIXIN_OUT_PATH_META_MONITORING)/rules.yaml charts/meta-monitoring/src/rules/loki-rules.yaml
|
||||
|
20
README.md
20
README.md
@@ -1,20 +1,9 @@
|
||||
# meta-monitoring-chart
|
||||
|
||||
This is a meta-monitoring chart for GEL, GEM and GET. It should be installed in a
|
||||
separate namespace next to GEM, GEL or GET installations.
|
||||
This is a meta-monitoring chart for Loki.
|
||||
|
||||
Note that this is pre-production software at the moment.
|
||||
|
||||
## Preparation
|
||||
|
||||
Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml).
|
||||
|
||||
1. Add or remove the namespaces to monitor in the `namespacesToMonitor` setting
|
||||
|
||||
1. Set the cluster name in the `clusterName` setting. This will be added as a label to all logs, metrics and traces.
|
||||
|
||||
1. Create a `meta` namespace.
|
||||
|
||||
## Local and cloud modes
|
||||
|
||||
The chart has 2 modes: local and cloud. In the local mode logs, metrics and/or traces are sent
|
||||
@@ -34,12 +23,6 @@ Both modes can be enabled at the same time.
|
||||
|
||||
## Installation
|
||||
|
||||
```
|
||||
helm install -n meta --skip-crds -f values.yaml meta ./charts/meta-monitoring
|
||||
```
|
||||
|
||||
If the platform supports CRDs the `--skip-crds` option can be removed. However the CRDs are not used by this chart.
|
||||
|
||||
For more instructions including how to update the chart go to the [installation](docs/installation.md) page.
|
||||
|
||||
## Supported features
|
||||
@@ -59,7 +42,6 @@ Most of these features are enabled by default. See the values.yaml file for how
|
||||
|
||||
## Caveats
|
||||
|
||||
- The [loki.source.kubernetes](https://grafana.com/docs/agent/latest/flow/reference/components/loki.source.kubernetes/) component of the Grafana Agent is used to scrape Kubernetes log files. This component is marked experimental at the moment.
|
||||
- This has not been tested on OpenShift yet.
|
||||
- The underlying Loki, Mimir and Tempo are at the default size installed by the Helm chart. This might need changing when monitoring bigger Loki, Mimir or Tempo installations.
|
||||
- MinIO is currently used as storage, with limited retention. For now, this chart cannot be used for monitoring over longer periods.
|
||||
|
@@ -1,18 +1,18 @@
|
||||
dependencies:
|
||||
- name: loki
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 5.8.0
|
||||
- name: grafana-agent
|
||||
version: 6.3.4
|
||||
- name: alloy
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 0.15.0
|
||||
version: 0.1.1
|
||||
- name: mimir-distributed
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 4.4.1
|
||||
version: 5.3.0
|
||||
- name: tempo-distributed
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 1.4.7
|
||||
version: 1.9.4
|
||||
- name: minio
|
||||
repository: https://charts.min.io
|
||||
version: 5.0.11
|
||||
digest: sha256:da0e744b5046eb7972e0bf82d1d0ba4786e9600af63b65f35b16118105248074
|
||||
generated: "2023-08-16T16:08:36.406791+01:00"
|
||||
version: 5.1.0
|
||||
digest: sha256:4bb2a4f62c9ebddcd64c28a94126ab3f07d319b028ea7c17ffbdf28d86b3be61
|
||||
generated: "2024-04-25T07:02:28.663945601Z"
|
||||
|
@@ -1,7 +1,6 @@
|
||||
apiVersion: v2
|
||||
name: meta-monitoring
|
||||
description: A Helm chart for meta monitoring Grafana Loki, Mimir and Tempo
|
||||
|
||||
# A chart can be either an 'application' or a 'library' chart.
|
||||
#
|
||||
# Application charts are a collection of templates that can be packaged into versioned archives
|
||||
@@ -11,35 +10,32 @@ description: A Helm chart for meta monitoring Grafana Loki, Mimir and Tempo
|
||||
# a dependency of application charts to inject those utilities and functions into the rendering
|
||||
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
|
||||
type: application
|
||||
|
||||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.0.1
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||
# It is recommended to use it with quotes.
|
||||
appVersion: "0.0.1"
|
||||
|
||||
dependencies:
|
||||
- name: loki
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: "5.8.0"
|
||||
version: 6.3.4
|
||||
condition: local.logs.enabled
|
||||
- name: grafana-agent
|
||||
- name: alloy
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: "0.15.0"
|
||||
version: 0.1.1
|
||||
- name: mimir-distributed
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: "4.4.1"
|
||||
version: 5.3.0
|
||||
condition: local.metrics.enabled
|
||||
- name: tempo-distributed
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: "1.4.7"
|
||||
version: 1.9.4
|
||||
condition: local.traces.enabled
|
||||
- name: minio
|
||||
repository: https://charts.min.io
|
||||
version: "5.0.11"
|
||||
version: 5.1.0
|
||||
condition: local.minio.enabled
|
||||
|
@@ -1,4 +1,4 @@
|
||||
.DEFAULT_GOAL := all
|
||||
.DEFAULT_GOAL := lint
|
||||
.PHONY: lint lint-yaml
|
||||
|
||||
lint: lint-yaml
|
||||
|
BIN
charts/meta-monitoring/charts/alloy-0.1.1.tgz
Normal file
BIN
charts/meta-monitoring/charts/alloy-0.1.1.tgz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/loki-6.3.4.tgz
Normal file
BIN
charts/meta-monitoring/charts/loki-6.3.4.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/mimir-distributed-5.3.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/mimir-distributed-5.3.0.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/minio-5.1.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/minio-5.1.0.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/tempo-distributed-1.9.4.tgz
Normal file
BIN
charts/meta-monitoring/charts/tempo-distributed-1.9.4.tgz
Normal file
Binary file not shown.
@@ -8,4 +8,4 @@ chart-repos:
|
||||
- minio=https://charts.min.io
|
||||
helm-extra-args: --timeout 1200s
|
||||
check-version-increment: false
|
||||
validate-maintainers: false
|
||||
validate-maintainers: false
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -62,7 +62,6 @@
|
||||
"expr": "sum(loki_compactor_pending_delete_requests_count{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -138,7 +137,6 @@
|
||||
"expr": "max(loki_compactor_oldest_pending_delete_request_age_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -191,232 +189,145 @@
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 3,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} or on() vector(0)) - on () (loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} or on () vector(0))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "in progress",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "# of Delete Requests (received - processed) ",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 4,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "received",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Delete Requests Received / Day",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 5,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "processed",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Delete Requests Processed / Day",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
@@ -431,232 +342,145 @@
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 6,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{pod}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Compactor CPU usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 7,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} / 1024 / 1024 ",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": " {{pod}} ",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Compactor memory usage (MiB)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 8,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "loki_boltdb_shipper_compact_tables_operation_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{pod}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Compaction run duration (seconds)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
@@ -671,156 +495,98 @@
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 9,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(loki_compactor_load_pending_requests_attempts_total{status=\"fail\", cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "failures",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Failures in Loading Delete Requests / Hour",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 10,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [ ]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"id": 10,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\",job=~\"$namespace/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (user)",
|
||||
"expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"}[$__rate_interval])) by (user)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{user}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
"legendLink": null
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Lines Deleted / Sec",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
@@ -840,7 +606,7 @@
|
||||
"span": 6,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} |~ \"Started processing delete request|delete request for user marked as processed\" | logfmt | line_format \"{{.ts}} user={{.user}} delete_request_id={{.delete_request_id}} msg={{.msg}}\" ",
|
||||
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"} |~ \"Started processing delete request|delete request for user marked as processed\" | logfmt | line_format \"{{.ts}} user={{.user}} delete_request_id={{.delete_request_id}} msg={{.msg}}\" ",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -853,7 +619,7 @@
|
||||
"span": 6,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} |~ \"delete request for user added\" | logfmt | line_format \"{{.ts}} user={{.user}} query='{{.query}}'\"",
|
||||
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"} |~ \"delete request for user added\" | logfmt | line_format \"{{.ts}} user={{.user}} query='{{.query}}'\"",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -882,7 +648,7 @@
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Data Source",
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
|
@@ -77,7 +77,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -114,6 +114,11 @@
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -164,7 +169,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -236,7 +241,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))",
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[$__rate_interval]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -250,7 +255,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -287,6 +292,11 @@
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -336,7 +346,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -373,6 +383,11 @@
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "binBps"
|
||||
}
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -408,7 +423,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -422,7 +437,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -459,6 +474,11 @@
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "binBps"
|
||||
}
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -494,7 +514,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@@ -508,7 +528,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -595,7 +615,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -632,6 +652,11 @@
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops"
|
||||
}
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -667,7 +692,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)",
|
||||
"expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[$__rate_interval])) by (level)",
|
||||
"legendFormat": "{{level}}",
|
||||
"refId": "A"
|
||||
}
|
||||
@@ -682,7 +707,7 @@
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -718,7 +743,12 @@
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$logs",
|
||||
"datasource": "$loki_datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops"
|
||||
}
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
@@ -771,7 +801,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)",
|
||||
"expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" | __error__=\"\" [$__auto])) by (level)",
|
||||
"intervalFactor": 3,
|
||||
"legendFormat": "{{level}}",
|
||||
"refId": "A"
|
||||
@@ -787,7 +817,7 @@
|
||||
"sort": 2,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"type": "timeseries",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
@@ -819,7 +849,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"datasource": "$logs",
|
||||
"datasource": "$loki_datasource",
|
||||
"gridPos": {
|
||||
"h": 19,
|
||||
"w": 24,
|
||||
@@ -861,7 +891,7 @@
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Data Source",
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
@@ -918,7 +948,7 @@
|
||||
{
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "logs",
|
||||
"name": "loki_datasource",
|
||||
"options": [ ],
|
||||
"query": "loki",
|
||||
"refresh": 1,
|
||||
@@ -1069,4 +1099,4 @@
|
||||
"title": "Loki / Logs",
|
||||
"uid": "logs",
|
||||
"version": 0
|
||||
}
|
||||
}
|
@@ -300,7 +300,8 @@
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
@@ -600,7 +601,7 @@
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Data Source",
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,53 +1,52 @@
|
||||
groups:
|
||||
- name: "loki_rules"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:loki_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:loki_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[5m]))
|
||||
by (cluster, job)"
|
||||
record: "cluster_job:loki_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
|
||||
record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:loki_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:loki_request_duration_seconds_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, route))"
|
||||
record: "cluster_job_route:loki_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, route))"
|
||||
record: "cluster_job_route:loki_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)
|
||||
/ sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||
record: "cluster_job_route:loki_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
||||
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job,
|
||||
route)"
|
||||
record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)"
|
||||
record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||
record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||
job, route) / sum(rate(loki_request_duration_seconds_count[5m])) by (cluster,
|
||||
namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
||||
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
|
||||
job, route)"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||
job, route)"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
|
||||
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, namespace,
|
||||
job, route)"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate"
|
||||
record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate"
|
@@ -1,322 +1,317 @@
|
||||
groups:
|
||||
- name: "mimir_api_1"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[5m]))
|
||||
by (cluster, job)"
|
||||
record: "cluster_job:cortex_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
|
||||
record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_api_2"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, route))"
|
||||
record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, route))"
|
||||
record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)
|
||||
/ sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||
record: "cluster_job_route:cortex_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job,
|
||||
route)"
|
||||
record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)"
|
||||
record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||
record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_api_3"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||
job, route) / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster,
|
||||
namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
|
||||
job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||
job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
|
||||
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, namespace,
|
||||
job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_querier_api"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||
job) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_querier_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, route))"
|
||||
record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, route))"
|
||||
record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by
|
||||
(cluster, job, route)"
|
||||
record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
job, route)"
|
||||
record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||
job, route)"
|
||||
record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||
job, route)"
|
||||
record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m]))
|
||||
by (cluster, namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||
namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||
namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_cache"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, method))"
|
||||
record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, method))"
|
||||
record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
|
||||
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[5m]))
|
||||
by (cluster, job, method)"
|
||||
record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
job, method)"
|
||||
record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
|
||||
job, method)"
|
||||
record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[5m])) by (cluster,
|
||||
job, method)"
|
||||
record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)
|
||||
/ sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_cache_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, method))"
|
||||
record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job, method))"
|
||||
record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
|
||||
method) / sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||
job, method)"
|
||||
record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
job, method)"
|
||||
record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
|
||||
method)"
|
||||
record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||
job, method)"
|
||||
record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_storage"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)
|
||||
/ sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_kv_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_queries"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_query_frontend_retries:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_query_frontend_retries:50quantile"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[5m]))
|
||||
by (cluster, job)"
|
||||
record: "cluster_job:cortex_query_frontend_retries:avg"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job)"
|
||||
record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_query_frontend_retries_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
|
||||
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by
|
||||
(cluster, job)"
|
||||
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le,
|
||||
cluster, job)"
|
||||
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
||||
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by (cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
|
||||
- name: "mimir_ingester_queries"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_ingester_queried_series:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_ingester_queried_series:50quantile"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[5m]))
|
||||
by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_series:avg"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_series_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_ingester_queried_samples:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_ingester_queried_samples:50quantile"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[5m]))
|
||||
by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_samples:avg"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_samples_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
||||
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
|
||||
by (le, cluster, job))"
|
||||
record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
|
||||
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job) /
|
||||
sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_exemplars:avg"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster,
|
||||
job)"
|
||||
record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
|
||||
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
|
||||
record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
|
||||
- name: "mimir_received_samples"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
|
||||
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
|
||||
record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
|
||||
- name: "mimir_exemplars_in"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
|
||||
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
|
||||
record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
|
||||
- name: "mimir_received_exemplars"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
|
||||
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
|
||||
record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
|
||||
- name: "mimir_exemplars_ingested"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
|
||||
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
|
||||
record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
|
||||
- name: "mimir_exemplars_appended"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
|
||||
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
|
||||
record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
|
||||
- name: "mimir_scaling_rules"
|
||||
rules:
|
||||
- expr: "|
|
||||
- expr: |
|
||||
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
||||
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||
sum by (cluster, namespace, deployment) (
|
||||
label_replace(
|
||||
kube_deployment_spec_replicas,
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)
|
||||
or
|
||||
sum by (cluster, namespace, deployment) (
|
||||
label_replace(kube_statefulset_replicas, \"deployment\", \"$1\", \"statefulset\", \"(.*?)(?:-zone-[a-z])?\")
|
||||
)"
|
||||
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
|
||||
)
|
||||
record: "cluster_namespace_deployment:actual_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
quantile_over_time(0.99,
|
||||
sum by (cluster, namespace) (
|
||||
@@ -324,21 +319,21 @@ groups:
|
||||
)[24h:]
|
||||
)
|
||||
/ 240000
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "distributor"
|
||||
reason: "sample_rate"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
|
||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||
* 0.59999999999999998 / 240000
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "distributor"
|
||||
reason: "sample_rate_limits"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
quantile_over_time(0.99,
|
||||
sum by (cluster, namespace) (
|
||||
@@ -346,12 +341,12 @@ groups:
|
||||
)[24h:]
|
||||
)
|
||||
* 3 / 80000
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "ingester"
|
||||
reason: "sample_rate"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
quantile_over_time(0.99,
|
||||
sum by(cluster, namespace) (
|
||||
@@ -359,59 +354,59 @@ groups:
|
||||
)[24h:]
|
||||
)
|
||||
/ 1500000
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "ingester"
|
||||
reason: "active_series"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"max_global_series_per_user\"})
|
||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
|
||||
* 3 * 0.59999999999999998 / 1500000
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "ingester"
|
||||
reason: "active_series_limits"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
|
||||
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||
* 0.59999999999999998 / 80000
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "ingester"
|
||||
reason: "sample_rate_limits"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
ceil(
|
||||
(sum by (cluster, namespace) (
|
||||
cortex_ingester_tsdb_storage_blocks_bytes{job=~\".+/ingester.*\"}
|
||||
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
|
||||
) / 4)
|
||||
/
|
||||
avg by (cluster, namespace) (
|
||||
memcached_limit_bytes{job=~\".+/memcached\"}
|
||||
memcached_limit_bytes{job=~".+/memcached"}
|
||||
)
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
deployment: "memcached"
|
||||
reason: "active_series"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
sum by (cluster, namespace, deployment) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
|
||||
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
||||
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[5m])),
|
||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||
),
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)"
|
||||
)
|
||||
record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
||||
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||
# that remove resource metrics, ref:
|
||||
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||
@@ -424,11 +419,11 @@ groups:
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_container_resource_requests_cpu_cores,
|
||||
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||
),
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)
|
||||
)
|
||||
@@ -439,17 +434,17 @@ groups:
|
||||
sum by (cluster, namespace, deployment) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_container_resource_requests{resource=\"cpu\"},
|
||||
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
||||
kube_pod_container_resource_requests{resource="cpu"},
|
||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||
),
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)
|
||||
)"
|
||||
)
|
||||
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
# Jobs should be sized to their CPU usage.
|
||||
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||
# their current provisioned #replicas and resource requests.
|
||||
@@ -459,28 +454,28 @@ groups:
|
||||
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
|
||||
/
|
||||
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
reason: "cpu_usage"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
||||
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||
sum by (cluster, namespace, deployment) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
container_memory_usage_bytes{image!=\"\"},
|
||||
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
||||
container_memory_usage_bytes{image!=""},
|
||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||
),
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)"
|
||||
)
|
||||
record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
||||
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
|
||||
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||
# that remove resource metrics, ref:
|
||||
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||
@@ -493,11 +488,11 @@ groups:
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_container_resource_requests_memory_bytes,
|
||||
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||
),
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)
|
||||
)
|
||||
@@ -508,17 +503,17 @@ groups:
|
||||
sum by (cluster, namespace, deployment) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_container_resource_requests{resource=\"memory\"},
|
||||
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
|
||||
kube_pod_container_resource_requests{resource="memory"},
|
||||
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||
),
|
||||
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
|
||||
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||
# always matches everything and the (optional) zone is not removed.
|
||||
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
|
||||
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||
)
|
||||
)
|
||||
)"
|
||||
)
|
||||
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
|
||||
- expr: "|
|
||||
- expr: |
|
||||
# Jobs should be sized to their Memory usage.
|
||||
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||
# their current provisioned #replicas and resource requests.
|
||||
@@ -528,44 +523,33 @@ groups:
|
||||
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
|
||||
/
|
||||
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||
)"
|
||||
)
|
||||
labels:
|
||||
reason: "memory_usage"
|
||||
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||
- name: "mimir_alertmanager_rules"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
|
||||
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
|
||||
record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
|
||||
- expr: "|
|
||||
sum by (cluster, job, pod) (cortex_alertmanager_silences)"
|
||||
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_silences)"
|
||||
record: "cluster_job_pod:cortex_alertmanager_silences:sum"
|
||||
- expr: "|
|
||||
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
|
||||
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
|
||||
record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
|
||||
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
|
||||
record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
|
||||
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
|
||||
record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
|
||||
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
|
||||
record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
|
||||
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
|
||||
record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
|
||||
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
|
||||
record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
|
||||
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
|
||||
record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
|
||||
- expr: "|
|
||||
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
|
||||
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
|
||||
record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
|
||||
- name: "mimir_ingester_rules"
|
||||
rules:
|
||||
- expr: "|
|
||||
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))"
|
||||
- expr: "sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[5m]))"
|
||||
record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"
|
||||
|
@@ -1,15 +1,15 @@
|
||||
groups:
|
||||
- name: "tempo_rules"
|
||||
rules:
|
||||
- expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
|
||||
- expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
|
||||
- expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
|
||||
record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
|
||||
- expr: "sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
|
||||
record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"
|
||||
|
33
charts/meta-monitoring/templates/_helpers.tpl
Normal file
33
charts/meta-monitoring/templates/_helpers.tpl
Normal file
@@ -0,0 +1,33 @@
|
||||
{{/*
|
||||
Return the appropriate apiVersion for ingress.
|
||||
*/}}
|
||||
{{- define "ingress.apiVersion" -}}
|
||||
{{- if and (.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) -}}
|
||||
{{- print "networking.k8s.io/v1" -}}
|
||||
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" -}}
|
||||
{{- print "networking.k8s.io/v1beta1" -}}
|
||||
{{- else -}}
|
||||
{{- print "extensions/v1beta1" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Return if ingress is stable.
|
||||
*/}}
|
||||
{{- define "ingress.isStable" -}}
|
||||
{{- eq (include "ingress.apiVersion" .) "networking.k8s.io/v1" -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Return if ingress supports ingressClassName.
|
||||
*/}}
|
||||
{{- define "ingress.supportsIngressClassName" -}}
|
||||
{{- or (eq (include "ingress.isStable" .) "true") (and (eq (include "ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Return if ingress supports pathType.
|
||||
*/}}
|
||||
{{- define "ingress.supportsPathType" -}}
|
||||
{{- or (eq (include "ingress.isStable" .) "true") (and (eq (include "ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) -}}
|
||||
{{- end -}}
|
@@ -18,10 +18,10 @@
|
||||
{{- end }}
|
||||
|
||||
{{- define "agent.loki_process_targets" -}}
|
||||
{{- if empty .Values.logs.piiRegexes }}
|
||||
{{- if and (empty .Values.logs.piiRegexes) (empty .Values.logs.retain) }}
|
||||
{{- include "agent.loki_write_targets" . }}
|
||||
{{- else }}
|
||||
{{- printf "loki.process.PII.receiver" }}
|
||||
{{- printf "loki.process.filter.receiver" }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
|
@@ -8,7 +8,7 @@ data:
|
||||
discovery.kubernetes "pods" {
|
||||
role = "pod"
|
||||
namespaces {
|
||||
own_namespace = false
|
||||
own_namespace = true
|
||||
names = [ {{ include "agent.namespaces" . }} ]
|
||||
}
|
||||
}
|
||||
@@ -33,22 +33,38 @@ data:
|
||||
}
|
||||
rule {
|
||||
target_label = "cluster"
|
||||
replacement = "{{- .Values.clusterName -}}"
|
||||
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||
}
|
||||
}
|
||||
|
||||
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
||||
// Logs
|
||||
|
||||
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
||||
remote.kubernetes.secret "logs_credentials" {
|
||||
namespace = "{{- $.Release.Namespace -}}"
|
||||
name = "{{- .Values.cloud.logs.secret -}}"
|
||||
}
|
||||
|
||||
loki.source.kubernetes "pods" {
|
||||
clustering {
|
||||
enabled = true
|
||||
}
|
||||
targets = discovery.relabel.rename_meta_labels.output
|
||||
forward_to = [ {{ include "agent.loki_process_targets" . }} ]
|
||||
}
|
||||
|
||||
{{- if not (empty .Values.logs.piiRegexes) }}
|
||||
loki.process "PII" {
|
||||
{{- if or (not (empty .Values.logs.retain)) (not (empty .Values.logs.piiRegexes)) }}
|
||||
loki.process "filter" {
|
||||
forward_to = [ {{ include "agent.loki_write_targets" . }} ]
|
||||
|
||||
{{- if not (empty .Values.logs.retain) }}
|
||||
stage.match {
|
||||
selector = "{cluster=\"{{- .Values.clusterLabelValue -}}\", namespace=~\"{{- join "|" .Values.namespacesToMonitor -}}|{{- $.Release.Namespace -}}\", pod=~\"loki.*\"} !~ \"{{ join "|" .Values.logs.retain }}\""
|
||||
action = "drop"
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
{{- if not (empty .Values.logs.piiRegexes) }}
|
||||
{{- range .Values.logs.piiRegexes }}
|
||||
stage.replace {
|
||||
expression = "{{ .expression }}"
|
||||
@@ -56,26 +72,85 @@ data:
|
||||
replace = "{{ .replace }}"
|
||||
}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
||||
// Metrics
|
||||
|
||||
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
||||
remote.kubernetes.secret "metrics_credentials" {
|
||||
namespace = "{{- $.Release.Namespace -}}"
|
||||
name = "{{- .Values.cloud.metrics.secret -}}"
|
||||
}
|
||||
|
||||
discovery.kubernetes "metric_pods" {
|
||||
role = "pod"
|
||||
namespaces {
|
||||
own_namespace = true
|
||||
names = [ {{ include "agent.namespaces" . }} ]
|
||||
}
|
||||
}
|
||||
|
||||
discovery.relabel "only_http_metrics" {
|
||||
targets = discovery.kubernetes.metric_pods.targets
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_namespace"]
|
||||
target_label = "namespace"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_name"]
|
||||
target_label = "pod"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||
separator = "/"
|
||||
regex = "(.*)/(.*)/(.*)"
|
||||
replacement = "${1}/${2}-${3}"
|
||||
target_label = "job"
|
||||
}
|
||||
rule {
|
||||
target_label = "cluster"
|
||||
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_container_port_number"]
|
||||
action = "drop"
|
||||
regex = "9095"
|
||||
}
|
||||
}
|
||||
|
||||
prometheus.scrape "pods" {
|
||||
targets = discovery.relabel.rename_meta_labels.output
|
||||
clustering {
|
||||
enabled = true
|
||||
}
|
||||
targets = discovery.relabel.only_http_metrics.output
|
||||
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||
}
|
||||
|
||||
prometheus.relabel "filter" {
|
||||
rule {
|
||||
source_labels = ["__name__"]
|
||||
regex = "({{ join "|" .Values.metrics.retain }})"
|
||||
action = "keep"
|
||||
}
|
||||
|
||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||
}
|
||||
{{- if .Values.kubeStateMetrics.enabled }}
|
||||
|
||||
prometheus.scrape "kubeStateMetrics" {
|
||||
clustering {
|
||||
enabled = true
|
||||
}
|
||||
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
|
||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
// cAdvisor and Kubelete metrics
|
||||
// cAdvisor and Kubelet metrics
|
||||
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
|
||||
discovery.kubernetes "all_nodes" {
|
||||
role = "node"
|
||||
@@ -104,15 +179,17 @@ data:
|
||||
}
|
||||
rule {
|
||||
target_label = "cluster"
|
||||
replacement = "{{- .Values.clusterName -}}"
|
||||
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||
}
|
||||
}
|
||||
|
||||
prometheus.scrape "cadvisor" {
|
||||
clustering {
|
||||
enabled = true
|
||||
}
|
||||
targets = discovery.relabel.all_nodes.output
|
||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||
|
||||
scrape_interval = "15s"
|
||||
metrics_path = "/metrics/cadvisor"
|
||||
scheme = "https"
|
||||
|
||||
@@ -123,10 +200,12 @@ data:
|
||||
}
|
||||
|
||||
prometheus.scrape "kubelet" {
|
||||
clustering {
|
||||
enabled = true
|
||||
}
|
||||
targets = discovery.relabel.all_nodes.output
|
||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||
|
||||
scrape_interval = "15s"
|
||||
metrics_path = "/metrics"
|
||||
scheme = "https"
|
||||
|
||||
@@ -136,18 +215,20 @@ data:
|
||||
}
|
||||
}
|
||||
|
||||
prometheus.exporter.unix {}
|
||||
prometheus.exporter.unix "promexporter" {}
|
||||
|
||||
prometheus.scrape "node_exporter" {
|
||||
targets = prometheus.exporter.unix.targets
|
||||
clustering {
|
||||
enabled = true
|
||||
}
|
||||
targets = prometheus.exporter.unix.promexporter.targets
|
||||
forward_to = [prometheus.relabel.node_exporter.receiver]
|
||||
|
||||
job_name = "node-exporter"
|
||||
scrape_interval = "15s"
|
||||
}
|
||||
|
||||
prometheus.relabel "node_exporter" {
|
||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||
|
||||
rule {
|
||||
replacement = env("HOSTNAME")
|
||||
@@ -178,14 +259,19 @@ data:
|
||||
}
|
||||
rule {
|
||||
target_label = "cluster"
|
||||
replacement = "{{- .Values.clusterName -}}"
|
||||
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||
}
|
||||
}
|
||||
{{- end }}
|
||||
|
||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
||||
// Traces
|
||||
|
||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
||||
remote.kubernetes.secret "traces_credentials" {
|
||||
namespace = "{{- $.Release.Namespace -}}"
|
||||
name = "{{- .Values.cloud.traces.secret -}}"
|
||||
}
|
||||
|
||||
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
||||
otelcol.receiver.otlp "otlp_receiver" {
|
||||
// We don't technically need this, but it shows how to change listen address and incoming port.
|
||||
@@ -254,11 +340,10 @@ data:
|
||||
{{- if .Values.cloud.logs.enabled }}
|
||||
loki.write "cloud" {
|
||||
endpoint {
|
||||
url = "{{- .Values.cloud.logs.endpoint -}}/loki/api/v1/push"
|
||||
|
||||
url = nonsensitive(remote.kubernetes.secret.logs_credentials.data["endpoint"])
|
||||
basic_auth {
|
||||
username = "{{- .Values.cloud.logs.username -}}"
|
||||
password = "{{- .Values.cloud.logs.password -}}"
|
||||
username = nonsensitive(remote.kubernetes.secret.logs_credentials.data["username"])
|
||||
password = remote.kubernetes.secret.logs_credentials.data["password"]
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -267,11 +352,10 @@ data:
|
||||
{{- if .Values.cloud.metrics.enabled }}
|
||||
prometheus.remote_write "cloud" {
|
||||
endpoint {
|
||||
url = "{{- .Values.cloud.metrics.endpoint -}}/api/prom/push"
|
||||
|
||||
url = nonsensitive(remote.kubernetes.secret.metrics_credentials.data["endpoint"])
|
||||
basic_auth {
|
||||
username = "{{- .Values.cloud.metrics.username -}}"
|
||||
password = "{{- .Values.cloud.metrics.password -}}"
|
||||
username = nonsensitive(remote.kubernetes.secret.metrics_credentials.data["username"])
|
||||
password = remote.kubernetes.secret.metrics_credentials.data["password"]
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -280,13 +364,13 @@ data:
|
||||
{{- if .Values.cloud.traces.enabled }}
|
||||
otelcol.exporter.otlp "cloud" {
|
||||
client {
|
||||
endpoint = "{{- .Values.cloud.traces.endpoint -}}"
|
||||
endpoint = nonsensitive(remote.kubernetes.secret.traces_credentials.data["endpoint"])
|
||||
auth = otelcol.auth.basic.creds.handler
|
||||
}
|
||||
}
|
||||
|
||||
otelcol.auth.basic "creds" {
|
||||
username = "{{- .Values.cloud.traces.username -}}"
|
||||
password = "{{- .Values.cloud.traces.password -}}"
|
||||
username = nonsensitive(remote.kubernetes.secret.traces_credentials.data["username"])
|
||||
password = remote.kubernetes.secret.traces_credentials.data["password"]
|
||||
}
|
||||
{{- end }}
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.traces.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
@@ -14,6 +14,6 @@ data:
|
||||
{{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }}
|
||||
"agent-tracing-pipeline.json": |
|
||||
{{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }}
|
||||
"agent-overview.json": |
|
||||
"agent.json": |
|
||||
{{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }}
|
||||
{{- end }}
|
@@ -1,4 +1,4 @@
|
||||
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
||||
{{- if .Values.local.grafana.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -0,0 +1,57 @@
|
||||
{{- if and .Values.local.grafana.enabled .Values.grafana.ingress.enabled -}}
|
||||
{{- $ingressApiIsStable := eq (include "ingress.isStable" .) "true" -}}
|
||||
{{- $ingressSupportsIngressClassName := eq (include "ingress.supportsIngressClassName" .) "true" -}}
|
||||
{{- $ingressSupportsPathType := eq (include "ingress.supportsPathType" .) "true" -}}
|
||||
apiVersion: {{ include "ingress.apiVersion" . }}
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: {{ $.Release.Namespace }}
|
||||
labels:
|
||||
app: grafana
|
||||
{{- range $labelKey, $labelValue := .Values.grafana.ingress.labels }}
|
||||
{{ $labelKey }}: {{ $labelValue | toYaml }}
|
||||
{{- end }}
|
||||
{{- with .Values.grafana.ingress.annotations }}
|
||||
annotations:
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- if and $ingressSupportsIngressClassName .Values.grafana.ingress.ingressClassName }}
|
||||
ingressClassName: {{ .Values.grafana.ingress.ingressClassName }}
|
||||
{{- end -}}
|
||||
{{- if .Values.grafana.ingress.tls }}
|
||||
tls:
|
||||
{{- range .Values.grafana.ingress.tls }}
|
||||
- hosts:
|
||||
{{- range .hosts }}
|
||||
- {{ tpl . $ | quote }}
|
||||
{{- end }}
|
||||
{{- with .secretName }}
|
||||
secretName: {{ . }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
rules:
|
||||
{{- range .Values.grafana.ingress.hosts }}
|
||||
- host: {{ tpl .host $ | quote }}
|
||||
http:
|
||||
paths:
|
||||
{{- range .paths }}
|
||||
- path: {{ .path }}
|
||||
{{- if $ingressSupportsPathType }}
|
||||
pathType: {{ .pathType }}
|
||||
{{- end }}
|
||||
backend:
|
||||
{{- if $ingressApiIsStable }}
|
||||
service:
|
||||
name: grafana
|
||||
port:
|
||||
number: 3000
|
||||
{{- else }}
|
||||
serviceName: grafana
|
||||
servicePort: 3000
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
@@ -1,4 +1,4 @@
|
||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
||||
{{- if .Values.local.grafana.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.logs.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.logs.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.metrics.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.metrics.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.metrics.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.metrics.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.metrics.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
||||
{{- if .Values.dashboards.traces.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled .Values.dashboards.traces.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
|
@@ -1,8 +1,9 @@
|
||||
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||
{{- if .Values.local.grafana.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: meta-mimir-ruler-for-dashboards
|
||||
name: {{ $.Release.Namespace }}-mimir-ruler-for-dashboards
|
||||
namespace: {{ $.Release.Namespace }}
|
||||
spec:
|
||||
progressDeadlineSeconds: 600
|
||||
@@ -91,8 +92,6 @@ spec:
|
||||
runAsUser: 10001
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
serviceAccount: meta-mimir
|
||||
serviceAccountName: meta-mimir
|
||||
terminationGracePeriodSeconds: 180
|
||||
topologySpreadConstraints:
|
||||
- labelSelector:
|
||||
@@ -109,11 +108,11 @@ spec:
|
||||
items:
|
||||
- key: mimir.yaml
|
||||
path: mimir.yaml
|
||||
name: meta-mimir-config
|
||||
name: {{ $.Release.Namespace }}-mimir-config
|
||||
name: config
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: meta-mimir-runtime
|
||||
name: {{ $.Release.Namespace }}-mimir-runtime
|
||||
name: runtime-config
|
||||
- emptyDir: {}
|
||||
name: storage
|
||||
@@ -124,3 +123,4 @@ spec:
|
||||
name: rules
|
||||
name: rules
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
@@ -1,4 +1,5 @@
|
||||
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||
{{- if .Values.local.metrics.enabled }}
|
||||
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
@@ -16,3 +17,4 @@ data:
|
||||
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
@@ -3,20 +3,20 @@
|
||||
{{- end -}}
|
||||
|
||||
{{- if eq .Values.cloud.logs.enabled true -}}
|
||||
{{- if or (empty .Values.cloud.logs.endpoint) (or (empty .Values.cloud.logs.username) (empty .Values.cloud.logs.password)) -}}
|
||||
{{- fail "if cloud.logs is enabled then the endpoint, username and password have to be filled in" -}}
|
||||
{{- if empty .Values.cloud.logs.secret -}}
|
||||
{{- fail "if cloud.logs is enabled then the secret has to be filled in" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{- if eq .Values.cloud.metrics.enabled true -}}
|
||||
{{- if or (empty .Values.cloud.metrics.endpoint) (or (empty .Values.cloud.metrics.username) (empty .Values.cloud.metrics.password)) -}}
|
||||
{{- fail "if cloud.metrics is enabled then the endpoint, username and password have to be filled in" -}}
|
||||
{{- if empty .Values.cloud.metrics.secret -}}
|
||||
{{- fail "if cloud.metrics is enabled then the secret has to be filled in" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{- if eq .Values.cloud.traces.enabled true -}}
|
||||
{{- if or (empty .Values.cloud.traces.endpoint) (or (empty .Values.cloud.traces.username) (empty .Values.cloud.traces.password)) -}}
|
||||
{{- fail "if cloud.traces is enabled then the endpoint, username and password have to be filled in" -}}
|
||||
{{- if empty .Values.cloud.traces.secret -}}
|
||||
{{- fail "if cloud.traces is enabled then the secret has to be filled in" -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
@@ -37,3 +37,7 @@
|
||||
{{- if empty .Values.namespacesToMonitor -}}
|
||||
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
|
||||
{{- end -}}
|
||||
|
||||
{{- if empty .Values.metrics.retain -}}
|
||||
{{- fail "All metrics will be collected, please specify some in metrics.retain" -}}
|
||||
{{- end -}}
|
||||
|
@@ -1,49 +1,178 @@
|
||||
# Specify the namespaces to monitor here
|
||||
namespacesToMonitor:
|
||||
- loki
|
||||
- mimir
|
||||
- tempo
|
||||
# The name of the cluster where this will be installed
|
||||
clusterName: "meta-monitoring"
|
||||
clusterLabelValue: "meta-monitoring"
|
||||
|
||||
# Set to true to write logs, metrics or traces to Grafana Cloud
|
||||
# The secrets have to be created first
|
||||
cloud:
|
||||
logs:
|
||||
enabled: true
|
||||
secret: "logs"
|
||||
metrics:
|
||||
enabled: true
|
||||
secret: "metrics"
|
||||
traces:
|
||||
enabled: true
|
||||
secret: "traces"
|
||||
|
||||
# Set to true for a local version of logs, metrics or traces
|
||||
local:
|
||||
grafana:
|
||||
enabled: false
|
||||
logs:
|
||||
enabled: true
|
||||
enabled: false
|
||||
metrics:
|
||||
enabled: true
|
||||
enabled: false
|
||||
traces:
|
||||
enabled: true
|
||||
enabled: false
|
||||
minio:
|
||||
enabled: true # This should be set to true if any of the previous is enabled
|
||||
enabled: false # This should be set to true if any of the previous is enabled
|
||||
|
||||
grafana:
|
||||
# Gateway ingress configuration
|
||||
ingress:
|
||||
# -- Specifies whether an ingress for the gateway should be created
|
||||
enabled: true
|
||||
# -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18
|
||||
ingressClassName: ""
|
||||
# -- Annotations for the gateway ingress
|
||||
annotations: { }
|
||||
# -- Labels for the gateway ingress
|
||||
labels: { }
|
||||
# -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating
|
||||
hosts:
|
||||
- host: monitoring.example.com
|
||||
paths:
|
||||
- path: /
|
||||
# -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers
|
||||
# pathType: Prefix
|
||||
# -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating
|
||||
#tls:
|
||||
# - secretName: grafana-tls
|
||||
# hosts:
|
||||
# - monitoring.example.com
|
||||
|
||||
# Set to true to write logs, metrics or traces to Grafana Cloud
|
||||
cloud:
|
||||
logs:
|
||||
enabled: false
|
||||
endpoint:
|
||||
username:
|
||||
password:
|
||||
metrics:
|
||||
enabled: false
|
||||
endpoint:
|
||||
username:
|
||||
password:
|
||||
traces:
|
||||
enabled: false
|
||||
endpoint:
|
||||
username:
|
||||
password:
|
||||
|
||||
# Adding regexes here will add a stage.replace block for logs. For more information see
|
||||
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
|
||||
logs:
|
||||
# Adding regexes here will add a stage.replace block for logs. For more information see
|
||||
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
|
||||
piiRegexes:
|
||||
# This example replaces the word after password with *****
|
||||
# - expression: "password (\\\\S+)"
|
||||
# source: "" # Empty uses the log message
|
||||
# replace: "*****"
|
||||
|
||||
# The lines matching these will be kept in Loki
|
||||
retain:
|
||||
# This shows the queries
|
||||
- caller=metrics.go
|
||||
# This shows any errors
|
||||
- level=error
|
||||
# This shows the ingest requests and is very noisy. Uncomment to include.
|
||||
# - caller=push.go
|
||||
# Log lines for delete requests
|
||||
- delete request for user added
|
||||
- Started processing delete request
|
||||
- delete request for user marked as processed
|
||||
|
||||
metrics:
|
||||
# The list of metrics to retain for logging dashboards
|
||||
retain:
|
||||
- agent_config_last_load_success_timestamp_seconds
|
||||
- agent_config_last_load_successful
|
||||
- agent_config_load_failures_total
|
||||
- container_cpu_usage_seconds_total
|
||||
- container_fs_writes_bytes_total
|
||||
- container_memory_working_set_bytes
|
||||
- container_network_receive_bytes_total
|
||||
- container_network_transmit_bytes_total
|
||||
- container_spec_cpu_period
|
||||
- container_spec_cpu_quota
|
||||
- container_spec_memory_limit_bytes
|
||||
- cortex_ingester_flush_queue_length
|
||||
- cortex_prometheus_rule_group_iterations_total
|
||||
- cortex_prometheus_rule_evaluation_failures_total
|
||||
- cortex_prometheus_rule_group_rules
|
||||
- cortex_prometheus_rule_group_last_duration_seconds
|
||||
- cortex_prometheus_rule_group_last_evaluation_timestamp_seconds
|
||||
- cortex_prometheus_rule_group_iterations_missed_total
|
||||
- go_gc_duration_seconds
|
||||
- go_goroutines
|
||||
- go_memstats_heap_inuse_bytes
|
||||
- kubelet_volume_stats_used_bytes
|
||||
- kubelet_volume_stats_capacity_bytes
|
||||
- kube_persistentvolumeclaim_labels
|
||||
- kube_pod_container_resource_requests
|
||||
- kube_pod_container_status_last_terminated_reason
|
||||
- kube_pod_container_status_restarts_total
|
||||
- loki_boltdb_shipper_compact_tables_operation_duration_seconds
|
||||
- loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds
|
||||
- loki_boltdb_shipper_retention_marker_count_total
|
||||
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket
|
||||
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_count
|
||||
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_sum
|
||||
- loki_boltdb_shipper_retention_marker_table_processed_total
|
||||
- loki_boltdb_shipper_request_duration_seconds_bucket
|
||||
- loki_boltdb_shipper_request_duration_seconds_count
|
||||
- loki_boltdb_shipper_request_duration_seconds_sum
|
||||
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_bucket
|
||||
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count
|
||||
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_sum
|
||||
- loki_boltdb_shipper_retention_sweeper_marker_files_current
|
||||
- loki_boltdb_shipper_retention_sweeper_marker_file_processing_current_time
|
||||
- loki_build_info
|
||||
- loki_chunk_store_deduped_chunks_total
|
||||
- loki_chunk_store_index_entries_per_chunk_bucket
|
||||
- loki_chunk_store_index_entries_per_chunk_count
|
||||
- loki_chunk_store_index_entries_per_chunk_sum
|
||||
- loki_compactor_delete_requests_processed_total
|
||||
- loki_compactor_delete_requests_received_total
|
||||
- loki_compactor_deleted_lines
|
||||
- loki_compactor_oldest_pending_delete_request_age_seconds
|
||||
- loki_compactor_pending_delete_requests_count
|
||||
- loki_discarded_samples_total
|
||||
- loki_distributor_bytes_received_total
|
||||
- loki_distributor_lines_received_total
|
||||
- loki_distributor_structured_metadata_bytes_received_total
|
||||
- loki_ingester_chunk_age_seconds_bucket
|
||||
- loki_ingester_chunk_age_seconds_count
|
||||
- loki_ingester_chunk_age_seconds_sum
|
||||
- loki_ingester_chunk_bounds_hours_bucket
|
||||
- loki_ingester_chunk_bounds_hours_count
|
||||
- loki_ingester_chunk_bounds_hours_sum
|
||||
- loki_ingester_chunk_entries_bucket
|
||||
- loki_ingester_chunk_entries_count
|
||||
- loki_ingester_chunk_entries_sum
|
||||
- loki_ingester_chunk_size_bytes_bucket
|
||||
- loki_ingester_chunk_utilization_bucket
|
||||
- loki_ingester_chunk_utilization_sum
|
||||
- loki_ingester_chunks_flushed_total
|
||||
- loki_ingester_flush_queue_length
|
||||
- loki_ingester_memory_chunks
|
||||
- loki_ingester_memory_streams
|
||||
- loki_ingester_streams_created_total
|
||||
- loki_request_duration_seconds_bucket
|
||||
- loki_request_duration_seconds_count
|
||||
- loki_request_duration_seconds_sum
|
||||
- loki_ruler_wal_appender_ready
|
||||
- loki_ruler_wal_disk_size
|
||||
- loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds
|
||||
- loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds
|
||||
- loki_ruler_wal_prometheus_remote_storage_samples_pending
|
||||
- loki_ruler_wal_prometheus_remote_storage_samples_total
|
||||
- loki_ruler_wal_samples_appended_total
|
||||
- loki_ruler_wal_storage_created_series_total
|
||||
- loki_write_batch_retries_total
|
||||
- loki_write_dropped_bytes_total
|
||||
- loki_write_dropped_entries_total
|
||||
- loki_write_sent_bytes_total
|
||||
- loki_write_sent_entries_total
|
||||
- node_disk_read_bytes_total
|
||||
- node_disk_written_bytes_total
|
||||
- promtail_custom_bad_words_total
|
||||
|
||||
# Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana
|
||||
dashboards:
|
||||
logs:
|
||||
@@ -72,6 +201,15 @@ kubeStateMetrics:
|
||||
loki:
|
||||
loki:
|
||||
auth_enabled: false
|
||||
schemaConfig:
|
||||
configs:
|
||||
- from: 2024-03-29
|
||||
store: tsdb
|
||||
object_store: s3
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
storage:
|
||||
type: "s3"
|
||||
s3:
|
||||
@@ -89,8 +227,13 @@ loki:
|
||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
||||
compactor:
|
||||
retention_enabled: true
|
||||
delete_request_store: s3
|
||||
limits_config:
|
||||
retention_period: 24h
|
||||
retention_period: 30d
|
||||
lokiCanary:
|
||||
enabled: false
|
||||
test:
|
||||
enabled: false
|
||||
monitoring:
|
||||
dashboards:
|
||||
enabled: false
|
||||
@@ -107,12 +250,28 @@ loki:
|
||||
test:
|
||||
enabled: false
|
||||
|
||||
grafana-agent:
|
||||
agent:
|
||||
alloy:
|
||||
alloy:
|
||||
clustering:
|
||||
enabled: true
|
||||
configMap:
|
||||
create: false
|
||||
name: "agent-configmap"
|
||||
key: 'config.river'
|
||||
resources:
|
||||
requests:
|
||||
cpu: '1000m'
|
||||
memory: '600Mi'
|
||||
limits:
|
||||
memory: '4Gi'
|
||||
controller:
|
||||
type: "statefulset"
|
||||
autoscaling:
|
||||
enabled: true
|
||||
minReplicas: 3
|
||||
maxReplicas: 30
|
||||
targetMemoryUtilizationPercentage: 90
|
||||
targetCPUUtilizationPercentage: 90
|
||||
|
||||
mimir-distributed:
|
||||
minio:
|
||||
@@ -142,7 +301,7 @@ mimir-distributed:
|
||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
||||
insecure: true
|
||||
limits:
|
||||
compactor_blocks_retention_period: 24h
|
||||
compactor_blocks_retention_period: 30d
|
||||
|
||||
tempo-distributed:
|
||||
tempo:
|
||||
@@ -158,7 +317,7 @@ tempo-distributed:
|
||||
insecure: true
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: 24h
|
||||
block_retention: 30d
|
||||
traces:
|
||||
otlp:
|
||||
http:
|
||||
|
10
docs/create_new_release.md
Normal file
10
docs/create_new_release.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Create a new release
|
||||
|
||||
1. Update the version field in charts/meta-monitoring/Chart.yaml in a new PR. Merge this PR if approved.
|
||||
|
||||
2. On the [Actions tab](https://github.com/grafana/meta-monitoring-chart/actions):
|
||||
- Select `Release Helm chart` in the workflows on the left
|
||||
- Click the `Run workflow` button
|
||||
- Leave the `main` branch as is
|
||||
- Click the green `Run workflow` button
|
||||
|
@@ -1,27 +1,166 @@
|
||||
# Install this chart
|
||||
|
||||
## Preparation for Cloud mode (preferred)
|
||||
|
||||
1. Use an existing Grafana Cloud account or set up a new one. Then create an access token:
|
||||
|
||||
1. In Grafana go to Administration -> Users and Access -> Cloud access policies.
|
||||
|
||||
1. Click `Create access policy`.
|
||||
|
||||
1. Fill in the `Display name` field and check the `Write` check box for metrics, logs and traces. Then click `Create`.
|
||||
|
||||
1. On the newly created access policy click `Add token`.
|
||||
|
||||
1. Fill in the `Token name` field and click `Create`. Make a copy of the token as it will be used later on.
|
||||
|
||||
1. Create the meta namespace
|
||||
|
||||
```
|
||||
kubectl create namespace meta
|
||||
```
|
||||
|
||||
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml).
|
||||
1. Create secrets with credentials and the endpoint when sending logs, metrics or traces to Grafana Cloud.
|
||||
|
||||
```
|
||||
kubectl create secret generic logs -n meta \
|
||||
--from-literal=username=<logs username> \
|
||||
--from-literal=password=<token> \
|
||||
--from-literal=endpoint='https://logs-prod-us-central1.grafana.net/loki/api/v1/push'
|
||||
|
||||
kubectl create secret generic metrics -n meta \
|
||||
--from-literal=username=<metrics username> \
|
||||
--from-literal=password=<token> \
|
||||
--from-literal=endpoint='https://prometheus-us-central1.grafana.net/api/prom/push'
|
||||
|
||||
kubectl create secret generic traces -n meta \
|
||||
--from-literal=username=<traces username> \
|
||||
--from-literal=password=<token> \
|
||||
--from-literal=endpoint='https://tempo-us-central1.grafana.net/tempo'
|
||||
```
|
||||
|
||||
The logs, metrics and traces usernames are the `User / Username / Instance IDs` of the Loki, Prometheus/Mimir and Tempo instances in Grafana Cloud. From `Home` in Grafana click on `Stacks`. Then go to the `Details` pages of Loki, Prometheus/Mimir and Tempo.
|
||||
|
||||
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). Fill in the names of the secrets created above as needed. An example minimal values.yaml looks like this:
|
||||
|
||||
```
|
||||
namespacesToMonitor:
|
||||
- loki
|
||||
|
||||
cloud:
|
||||
logs:
|
||||
enabled: true
|
||||
secret: "logs"
|
||||
metrics:
|
||||
enabled: true
|
||||
secret: "metrics"
|
||||
traces:
|
||||
enabled: true
|
||||
secret: "traces"
|
||||
```
|
||||
|
||||
## Preparation for Local mode
|
||||
|
||||
1. Create the meta namespace
|
||||
|
||||
```
|
||||
kubectl create namespace meta
|
||||
```
|
||||
|
||||
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). An example minimal values.yaml looks like this:
|
||||
|
||||
```
|
||||
namespacesToMonitor:
|
||||
- loki
|
||||
|
||||
cloud:
|
||||
logs:
|
||||
enabled: false
|
||||
metrics:
|
||||
enabled: false
|
||||
traces:
|
||||
enabled: false
|
||||
|
||||
local:
|
||||
grafana:
|
||||
enabled: true
|
||||
logs:
|
||||
enabled: true
|
||||
metrics:
|
||||
enabled: true
|
||||
traces:
|
||||
enabled: true
|
||||
minio:
|
||||
enabled: true
|
||||
```
|
||||
|
||||
## Installing the chart
|
||||
|
||||
1. Add the repo
|
||||
|
||||
```
|
||||
helm repo add grafana https://grafana.github.io/helm-charts
|
||||
```
|
||||
|
||||
1. Fetch the latest charts from the grafana repo
|
||||
|
||||
```
|
||||
helm repo update grafana
|
||||
```
|
||||
|
||||
|
||||
1. Install this helm chart
|
||||
|
||||
```
|
||||
helm install -n meta -f values.yaml meta ./charts/meta-monitoring
|
||||
helm install -n meta -f values.yaml meta grafana/meta-monitoring
|
||||
```
|
||||
|
||||
1. Upgrade
|
||||
|
||||
```
|
||||
helm upgrade --install -f values.yaml -n meta meta ./charts/meta-monitoring
|
||||
helm upgrade --install -f values.yaml -n meta meta grafana/meta-monitoring
|
||||
```
|
||||
|
||||
1. Delete this chart:
|
||||
|
||||
```
|
||||
helm delete -n meta meta
|
||||
```
|
||||
```
|
||||
|
||||
## Installing the dashboards and rules on Grafana Cloud
|
||||
|
||||
## Installing the dashboards on Grafana Cloud
|
||||
|
||||
Only the files for the application monitored have to be copied. When monitoring Loki import dashboard files starting with 'loki-'.
|
||||
|
||||
For each of the dashboard files in charts/meta-monitoring/src/dashboards folder do the following:
|
||||
|
||||
1. Click on 'Dashboards' in Grafana
|
||||
|
||||
1. Click on the 'New' button and select 'Import'
|
||||
|
||||
1. Drop the dashboard file to the 'Upload dashboard JSON file' drop area
|
||||
|
||||
1. Click 'Import'
|
||||
|
||||
## Installing the rules on Grafana Cloud
|
||||
|
||||
1. Select the rules files in charts/meta-monitoring/src/rules for the application to monitor. When monitoring Loki use loki-rules.yaml.
|
||||
|
||||
1. Install mimirtool as per the [instructions](https://grafana.com/docs/mimir/latest/manage/tools/mimirtool/)
|
||||
|
||||
1. Create an access policy with Read and Write permission for Rules. Also create a token and record the token.
|
||||
|
||||
1. Get your cloud Prometheus endpoint and Instance ID from the `Prometheus` page in `Stacks`.
|
||||
|
||||
1. Use them to load the rules using mimirtool as follows:
|
||||
|
||||
```
|
||||
mimirtool rules load --address=<your_cloud_prometheus_endpoint> --id=<your_instance_id> --key=<your_cloud_access_policy_token> *.yaml
|
||||
```
|
||||
|
||||
1. To check the rules you have uploaded run:
|
||||
|
||||
```
|
||||
mimirtool rules print --address=<your_cloud_prometheus_endpoint> --id=<your_instance_id> --key=<your_cloud_access_policy_token>
|
||||
```
|
20
scripts/clone_loki_mixin.sh
Executable file
20
scripts/clone_loki_mixin.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
clean_up() {
|
||||
test -d "$tmp_dir" && rm -fr "$tmp_dir"
|
||||
}
|
||||
|
||||
here=${PWD}
|
||||
|
||||
tmp_dir=$( mktemp -d -t my-script )
|
||||
cd $tmp_dir
|
||||
|
||||
echo "Cloning Loki"
|
||||
git clone --filter=blob:none --no-checkout "https://github.com/grafana/loki"
|
||||
cd loki
|
||||
git sparse-checkout init --cone
|
||||
git checkout main
|
||||
git sparse-checkout set production/loki-mixin
|
||||
|
||||
echo "Copying production/loki-mixin to ${here}"
|
||||
cp -r production ${here}
|
18
scripts/mixin-meta-monitoring.libsonnet
Normal file
18
scripts/mixin-meta-monitoring.libsonnet
Normal file
@@ -0,0 +1,18 @@
|
||||
(import 'dashboards.libsonnet') +
|
||||
(import 'alerts.libsonnet') +
|
||||
(import 'recording_rules.libsonnet') + {
|
||||
grafanaDashboardFolder: 'Loki Meta Monitoring',
|
||||
|
||||
_config+:: {
|
||||
internal_components: false,
|
||||
|
||||
// The Meta Monitoring helm chart uses Grafana Alloy instead of promtail
|
||||
promtail+: {
|
||||
enabled: false,
|
||||
},
|
||||
|
||||
meta_monitoring+: {
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
}
|
9
tools/kind.config
Normal file
9
tools/kind.config
Normal file
@@ -0,0 +1,9 @@
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
name: meta
|
||||
nodes:
|
||||
- role: control-plane
|
||||
- role: worker
|
||||
- role: worker
|
||||
- role: worker
|
||||
|
Reference in New Issue
Block a user