forked from RemoteSync/grafana-meta-monitoring-chart
Compare commits
254 Commits
add_valida
...
split_up_g
Author | SHA1 | Date | |
---|---|---|---|
|
57adbf43e2 | ||
|
add43ae974 | ||
|
52ec526718 | ||
|
8a5ed559a2 | ||
|
188cd7e56f | ||
|
9e4dbcd44a | ||
|
28daa27fca | ||
|
2de595baf4 | ||
|
95257b66d3 | ||
|
e9b0e57ef0 | ||
|
03609ebb35 | ||
|
7e38d19814 | ||
|
32272298d7 | ||
|
3879207e05 | ||
|
cd42da2197 | ||
|
56cab04af8 | ||
|
c6d0444dfa | ||
|
b99140d3f4 | ||
|
749e271455 | ||
|
d938dbbfe5 | ||
|
e9125d1a9c | ||
|
076685ef06 | ||
|
b0451d626e | ||
|
90e949e89a | ||
|
06e176e720 | ||
|
d4c886ba9d | ||
|
643e73f5f1 | ||
|
7e65f3d9c9 | ||
|
de91b4dac7 | ||
|
9f6e52d7a1 | ||
|
26e0ad0b85 | ||
|
025bb5b0c3 | ||
|
0b31eae425 | ||
|
ab42a96949 | ||
|
386ff25fca | ||
|
c6889131a7 | ||
|
2739bae0c0 | ||
|
cea8076b75 | ||
|
29b831ca00 | ||
|
09cf8f812c | ||
|
f8436a8e44 | ||
|
2b26abedbb | ||
|
017c041007 | ||
|
e7ad1383a6 | ||
|
2906836eae | ||
|
c70ef27e48 | ||
|
3c187def47 | ||
|
54eda36ec3 | ||
|
bc33e5a2a5 | ||
|
31e82bbf16 | ||
|
52c1bf1778 | ||
|
2c5c4d8e38 | ||
|
b6a5a3cfe3 | ||
|
a01992194b | ||
|
636b654828 | ||
|
5d553e50f6 | ||
|
3f200115f9 | ||
|
f0bdf0760d | ||
|
314b1db19b | ||
|
b547784d54 | ||
|
af4cd1f8c0 | ||
|
116119bdc4 | ||
|
df794115f0 | ||
|
c26e509f65 | ||
|
95f7905e34 | ||
|
ad1b619a33 | ||
|
446c0be743 | ||
|
be7a32de27 | ||
|
e41b2f360f | ||
|
1cafd696c7 | ||
|
c614f41d66 | ||
|
2144cea411 | ||
|
81a017551b | ||
|
1871a4ef87 | ||
|
11d80263a7 | ||
|
cdb0bee56e | ||
|
58171a6a42 | ||
|
c65445384b | ||
|
1f980f393e | ||
|
47d9190eda | ||
|
5ff9bd16c9 | ||
|
d6faaf88f5 | ||
|
2d711f7168 | ||
|
c666bf69c9 | ||
|
41619b99b1 | ||
|
5923139796 | ||
|
329d5822ea | ||
|
5498b27ad6 | ||
|
da687315e7 | ||
|
8f20e45c77 | ||
|
e81b1246f5 | ||
|
b103fb3434 | ||
|
9349d2d906 | ||
|
31536103c8 | ||
|
13c28aa50a | ||
|
385d0dd543 | ||
|
458451922d | ||
|
4b0d457af0 | ||
|
e60b2aecdc | ||
|
6244de677e | ||
|
d14e933e84 | ||
|
0210fba39d | ||
|
a97fa64880 | ||
|
34545e15b4 | ||
|
33b8e37bed | ||
|
0938193982 | ||
|
b1975505e5 | ||
|
c282bf352d | ||
|
60af0b4d19 | ||
|
0980cb2ede | ||
|
75ab1f0d97 | ||
|
dd49623508 | ||
|
095fb09d26 | ||
|
0fc5e2f847 | ||
|
d04d74cc26 | ||
|
b840555522 | ||
|
e1a8495227 | ||
|
a812b4f63a | ||
|
8dde9642c9 | ||
|
0832bc8e8f | ||
|
cbae75acb8 | ||
|
f5a5472b95 | ||
|
58a4696a6b | ||
|
e31f6b0906 | ||
|
351f50e238 | ||
|
170c17b721 | ||
|
b2d06ab8e3 | ||
|
9d63c32d4f | ||
|
a201cef34c | ||
|
ea4d5e278a | ||
|
e3c3f6a094 | ||
|
d6da6fec35 | ||
|
5e2ffb222b | ||
|
9b12bad16c | ||
|
ed31bcf345 | ||
|
a0184e27d0 | ||
|
3491886311 | ||
|
c58b76cfc7 | ||
|
a72e64327f | ||
|
cfdc6b95eb | ||
|
b1ccef91cb | ||
|
f3f970d783 | ||
|
b78571dfdc | ||
|
1859c3a82c | ||
|
f275b2d1b6 | ||
|
fd1aadc099 | ||
|
a6462d1ac1 | ||
|
690cda9eb5 | ||
|
00cad594f4 | ||
|
e74ec96349 | ||
|
0d3f9a1416 | ||
|
8fa5b63db7 | ||
|
d7063da3d4 | ||
|
e7f28a261e | ||
|
509a32bc59 | ||
|
6bb31ad5e0 | ||
|
7724d9c928 | ||
|
13294675fe | ||
|
bf71def2f8 | ||
|
b37fa4adf5 | ||
|
18a5face81 | ||
|
5e908f796c | ||
|
17b52d572a | ||
|
6eac38d4ec | ||
|
3706c702a1 | ||
|
28b77dab17 | ||
|
9770a3e5b3 | ||
|
6cbffd6d9d | ||
|
4ae23a99d2 | ||
|
20232e9cf3 | ||
|
043a503ce7 | ||
|
39f50d8580 | ||
|
d9fc9e4f4e | ||
|
f61913d3da | ||
|
c29daab64d | ||
|
d389a9f741 | ||
|
6f5f50f901 | ||
|
efea1c5054 | ||
|
b02aee6816 | ||
|
c522e3f39e | ||
|
e3542e472d | ||
|
3a138991ff | ||
|
cd78caab48 | ||
|
f281741de9 | ||
|
381ecb2c06 | ||
|
20cdb8dcc1 | ||
|
019f2b7b1e | ||
|
1bffcac5e5 | ||
|
d23291dc91 | ||
|
a89ba944a3 | ||
|
ef05e599e6 | ||
|
a586e753da | ||
|
76908c1e9e | ||
|
bc5cdadb9f | ||
|
687c77c0f6 | ||
|
2a0b14ee45 | ||
|
7e06d611a7 | ||
|
f4934d6007 | ||
|
427764278c | ||
|
1093e91741 | ||
|
1ed196299b | ||
|
faa0015c11 | ||
|
53416e042c | ||
|
d804da13f1 | ||
|
8c0b68fe02 | ||
|
99bb8f13c2 | ||
|
26ff679cbb | ||
|
fb3e3ece1b | ||
|
7a5358b322 | ||
|
9c92e18efe | ||
|
ffe220590d | ||
|
e3708ce3fe | ||
|
3149f4df9b | ||
|
86ec586917 | ||
|
6cd12bee01 | ||
|
b042b396a2 | ||
|
bcacb70e2d | ||
|
d9c3b60659 | ||
|
6d091d564e | ||
|
8671993962 | ||
|
f80c9d7c43 | ||
|
60853bc8b0 | ||
|
debdd67283 | ||
|
8bc465b2e6 | ||
|
18d24c39f7 | ||
|
23d14110a0 | ||
|
092423c2b3 | ||
|
dcbe85a37a | ||
|
db8558982c | ||
|
49034b9f6b | ||
|
15f96d9cc6 | ||
|
b9f22ba27e | ||
|
2f79e7ef14 | ||
|
aa988adb47 | ||
|
6fb22ae671 | ||
|
d3878e1516 | ||
|
8ae136e0c4 | ||
|
ac3e4462f9 | ||
|
e9aab491db | ||
|
c95c0e2ca9 | ||
|
c288a80bd4 | ||
|
93cac45b2e | ||
|
6ce4be70e2 | ||
|
176312167c | ||
|
07a336d9ed | ||
|
db493fbb39 | ||
|
f4d5bcc018 | ||
|
18f0dc932a | ||
|
d999ef0110 | ||
|
9dd6584bee | ||
|
04cf591478 | ||
|
9f54397e83 | ||
|
fa2b01708c | ||
|
a1cd5d36b0 |
3
.github/configs/cr.yaml
vendored
Normal file
3
.github/configs/cr.yaml
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
owner: grafana
|
||||||
|
git-repo: helm-charts
|
||||||
|
skip-existing: true
|
15
.github/configs/ct.yaml
vendored
Normal file
15
.github/configs/ct.yaml
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
## Reference: https://github.com/helm/chart-testing/blob/master/doc/ct_lint-and-install.md
|
||||||
|
remote: origin
|
||||||
|
target-branch: main
|
||||||
|
chart-dirs:
|
||||||
|
- charts
|
||||||
|
chart-repos:
|
||||||
|
- grafana=https://grafana.github.io/helm-charts
|
||||||
|
- minio=https://charts.min.io
|
||||||
|
validate-chart-schema: true
|
||||||
|
validate-maintainers: true
|
||||||
|
validate-yaml: true
|
||||||
|
exclude-deprecated: true
|
||||||
|
excluded-charts: []
|
||||||
|
namespace: meta-monitoring # Need to set the namespace because we create the secret there
|
||||||
|
release-label: app.kubernetes.io/instance
|
30
.github/configs/updatecli.d/alloy.yaml
vendored
Normal file
30
.github/configs/updatecli.d/alloy.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
name: Bump dependency "alloy" for Helm chart "meta-monitoring"
|
||||||
|
sources:
|
||||||
|
alloy:
|
||||||
|
name: Get latest "alloy" Helm chart version
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
name: alloy
|
||||||
|
url: https://grafana.github.io/helm-charts
|
||||||
|
versionfilter:
|
||||||
|
kind: semver
|
||||||
|
pattern: '*'
|
||||||
|
conditions:
|
||||||
|
alloy:
|
||||||
|
name: Ensure Helm chart dependency "alloy" is specified
|
||||||
|
kind: yaml
|
||||||
|
spec:
|
||||||
|
file: charts/meta-monitoring/Chart.yaml
|
||||||
|
key: $.dependencies[1].name
|
||||||
|
value: alloy
|
||||||
|
disablesourceinput: true
|
||||||
|
targets:
|
||||||
|
alloy:
|
||||||
|
name: Bump Helm chart dependency "alloy" for Helm chart "meta-monitoring"
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
file: Chart.yaml
|
||||||
|
key: $.dependencies[1].version
|
||||||
|
name: charts/meta-monitoring
|
||||||
|
versionincrement: none
|
||||||
|
sourceid: alloy
|
30
.github/configs/updatecli.d/grafana.yaml
vendored
Normal file
30
.github/configs/updatecli.d/grafana.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
name: Bump grafana version specified in the values.yaml
|
||||||
|
sources:
|
||||||
|
latestGrafanaRelease:
|
||||||
|
name: Get latest grafana release on Github
|
||||||
|
kind: githubrelease
|
||||||
|
spec:
|
||||||
|
owner: grafana
|
||||||
|
repository: grafana
|
||||||
|
token: '{{ requiredEnv "UPDATECLI_GITHUB_TOKEN" }}'
|
||||||
|
versionfilter:
|
||||||
|
kind: latest
|
||||||
|
transformers:
|
||||||
|
- trimprefix: "v"
|
||||||
|
conditions:
|
||||||
|
grafanaImagePublished:
|
||||||
|
name: Ensure the latest Grafana is published on DockerHub
|
||||||
|
kind: dockerimage
|
||||||
|
source-id: latestGrafanaRelease
|
||||||
|
spec:
|
||||||
|
image: "grafana/grafana"
|
||||||
|
targets:
|
||||||
|
grafana:
|
||||||
|
name: Update Grafana version in values.yaml
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
file: values.yaml
|
||||||
|
key: $.grafana.version
|
||||||
|
name: charts/meta-monitoring
|
||||||
|
versionincrement: none
|
||||||
|
sourceid: latestGrafanaRelease
|
30
.github/configs/updatecli.d/loki.yaml
vendored
Normal file
30
.github/configs/updatecli.d/loki.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
name: Bump dependency "loki" for Helm chart "meta-monitoring"
|
||||||
|
sources:
|
||||||
|
loki:
|
||||||
|
name: Get latest "loki" Helm chart version
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
name: loki
|
||||||
|
url: https://grafana.github.io/helm-charts
|
||||||
|
versionfilter:
|
||||||
|
kind: semver
|
||||||
|
pattern: '*'
|
||||||
|
conditions:
|
||||||
|
loki:
|
||||||
|
name: Ensure Helm chart dependency "loki" is specified
|
||||||
|
kind: yaml
|
||||||
|
spec:
|
||||||
|
file: charts/meta-monitoring/Chart.yaml
|
||||||
|
key: $.dependencies[0].name
|
||||||
|
value: loki
|
||||||
|
disablesourceinput: true
|
||||||
|
targets:
|
||||||
|
loki:
|
||||||
|
name: Bump Helm chart dependency "loki" for Helm chart "meta-monitoring"
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
file: Chart.yaml
|
||||||
|
key: $.dependencies[0].version
|
||||||
|
name: charts/meta-monitoring
|
||||||
|
versionincrement: none
|
||||||
|
sourceid: loki
|
30
.github/configs/updatecli.d/mimir-distributed.yaml
vendored
Normal file
30
.github/configs/updatecli.d/mimir-distributed.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
name: Bump dependency "mimir-distributed" for Helm chart "meta-monitoring"
|
||||||
|
sources:
|
||||||
|
mimir-distributed:
|
||||||
|
name: Get latest "mimir-distributed" Helm chart version
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
name: mimir-distributed
|
||||||
|
url: https://grafana.github.io/helm-charts
|
||||||
|
versionfilter:
|
||||||
|
kind: semver
|
||||||
|
pattern: '*'
|
||||||
|
conditions:
|
||||||
|
mimir-distributed:
|
||||||
|
name: Ensure Helm chart dependency "mimir-distributed" is specified
|
||||||
|
kind: yaml
|
||||||
|
spec:
|
||||||
|
file: charts/meta-monitoring/Chart.yaml
|
||||||
|
key: $.dependencies[2].name
|
||||||
|
value: mimir-distributed
|
||||||
|
disablesourceinput: true
|
||||||
|
targets:
|
||||||
|
mimir-distributed:
|
||||||
|
name: Bump Helm chart dependency "mimir-distributed" for Helm chart "meta-monitoring"
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
file: Chart.yaml
|
||||||
|
key: $.dependencies[2].version
|
||||||
|
name: charts/meta-monitoring
|
||||||
|
versionincrement: none
|
||||||
|
sourceid: mimir-distributed
|
30
.github/configs/updatecli.d/minio.yaml
vendored
Normal file
30
.github/configs/updatecli.d/minio.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
name: Bump dependency "minio" for Helm chart "meta-monitoring"
|
||||||
|
sources:
|
||||||
|
minio:
|
||||||
|
name: Get latest "minio" Helm chart version
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
name: minio
|
||||||
|
url: https://charts.min.io
|
||||||
|
versionfilter:
|
||||||
|
kind: semver
|
||||||
|
pattern: '*'
|
||||||
|
conditions:
|
||||||
|
minio:
|
||||||
|
name: Ensure Helm chart dependency "minio" is specified
|
||||||
|
kind: yaml
|
||||||
|
spec:
|
||||||
|
file: charts/meta-monitoring/Chart.yaml
|
||||||
|
key: $.dependencies[4].name
|
||||||
|
value: minio
|
||||||
|
disablesourceinput: true
|
||||||
|
targets:
|
||||||
|
minio:
|
||||||
|
name: Bump Helm chart dependency "minio" for Helm chart "meta-monitoring"
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
file: Chart.yaml
|
||||||
|
key: $.dependencies[4].version
|
||||||
|
name: charts/meta-monitoring
|
||||||
|
versionincrement: none
|
||||||
|
sourceid: minio
|
30
.github/configs/updatecli.d/tempo-distributed.yaml
vendored
Normal file
30
.github/configs/updatecli.d/tempo-distributed.yaml
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
name: Bump dependency "tempo-distributed" for Helm chart "meta-monitoring"
|
||||||
|
sources:
|
||||||
|
tempo-distributed:
|
||||||
|
name: Get latest "tempo-distributed" Helm chart version
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
name: tempo-distributed
|
||||||
|
url: https://grafana.github.io/helm-charts
|
||||||
|
versionfilter:
|
||||||
|
kind: semver
|
||||||
|
pattern: '*'
|
||||||
|
conditions:
|
||||||
|
tempo-distributed:
|
||||||
|
name: Ensure Helm chart dependency "tempo-distributed" is specified
|
||||||
|
kind: yaml
|
||||||
|
spec:
|
||||||
|
file: charts/meta-monitoring/Chart.yaml
|
||||||
|
key: $.dependencies[3].name
|
||||||
|
value: tempo-distributed
|
||||||
|
disablesourceinput: true
|
||||||
|
targets:
|
||||||
|
tempo-distributed:
|
||||||
|
name: Bump Helm chart dependency "tempo-distributed" for Helm chart "meta-monitoring"
|
||||||
|
kind: helmchart
|
||||||
|
spec:
|
||||||
|
file: Chart.yaml
|
||||||
|
key: $.dependencies[3].version
|
||||||
|
name: charts/meta-monitoring
|
||||||
|
versionincrement: none
|
||||||
|
sourceid: tempo-distributed
|
113
.github/workflows/check-for-dependency-updates.yaml
vendored
Normal file
113
.github/workflows/check-for-dependency-updates.yaml
vendored
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
---
|
||||||
|
name: Check for dependency updates
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
schedule:
|
||||||
|
# Run once a day
|
||||||
|
- cron: '0 7 * * *'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: "write"
|
||||||
|
pull-requests: "write"
|
||||||
|
|
||||||
|
env:
|
||||||
|
UPDATECLI_CONFIG_DIR: "${{ github.workspace }}/.github/configs/updatecli.d"
|
||||||
|
UPDATECLI_GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
updateVersions:
|
||||||
|
name: Update the subcharts
|
||||||
|
runs-on: "ubuntu-latest"
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: Install Updatecli
|
||||||
|
uses: updatecli/updatecli-action@v2
|
||||||
|
|
||||||
|
- name: Run Updatecli for Loki
|
||||||
|
id: update-loki
|
||||||
|
run: |
|
||||||
|
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/loki.yaml
|
||||||
|
if ! git diff --exit-code > /dev/null; then
|
||||||
|
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Run Updatecli for Alloy
|
||||||
|
id: update-grafana-alloy
|
||||||
|
run: |
|
||||||
|
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/alloy.yaml
|
||||||
|
if ! git diff --exit-code > /dev/null; then
|
||||||
|
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Run Updatecli for Mimir
|
||||||
|
id: update-mimir-distributed
|
||||||
|
run: |
|
||||||
|
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/mimir-distributed.yaml
|
||||||
|
if ! git diff --exit-code > /dev/null; then
|
||||||
|
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Run Updatecli for Tempo
|
||||||
|
id: update-tempo-distributed
|
||||||
|
run: |
|
||||||
|
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/tempo-distributed.yaml
|
||||||
|
if ! git diff --exit-code > /dev/null; then
|
||||||
|
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Run Updatecli for Minio
|
||||||
|
id: update-minio
|
||||||
|
run: |
|
||||||
|
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/minio.yaml
|
||||||
|
if ! git diff --exit-code > /dev/null; then
|
||||||
|
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Create pull request
|
||||||
|
if: steps.update-loki.outputs.changed == 'true' || steps.update-grafana-alloy.outputs.changed == 'true' || steps.update-mimir-distributed.outputs.changed == 'true' || steps.update-tempo-distributed.outputs.changed == 'true' || steps.update-minio.outputs.changed == 'true'
|
||||||
|
uses: peter-evans/create-pull-request@v5
|
||||||
|
with:
|
||||||
|
title: "[dependency] Update the subcharts"
|
||||||
|
body: "Updates the subcharts"
|
||||||
|
base: main
|
||||||
|
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||||
|
committer: "GitHub <noreply@github.com>"
|
||||||
|
commit-message: Update dependencies
|
||||||
|
labels: dependencies
|
||||||
|
branch: chore/update-dependencies
|
||||||
|
delete-branch: true
|
||||||
|
|
||||||
|
updateGrafana:
|
||||||
|
name: Update the Grafana version
|
||||||
|
runs-on: "ubuntu-latest"
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: Install Updatecli
|
||||||
|
uses: updatecli/updatecli-action@v2
|
||||||
|
|
||||||
|
- name: Run Updatecli
|
||||||
|
id: update-grafana
|
||||||
|
run: |
|
||||||
|
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/grafana.yaml
|
||||||
|
if ! git diff --exit-code > /dev/null; then
|
||||||
|
echo "changed=true" >> "${GITHUB_OUTPUT}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Create pull request
|
||||||
|
if: steps.update-grafana.outputs.changed == 'true'
|
||||||
|
uses: peter-evans/create-pull-request@v5
|
||||||
|
with:
|
||||||
|
title: "[dependency] Update the Grafana version"
|
||||||
|
body: "Updates the Grafana version"
|
||||||
|
base: main
|
||||||
|
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
|
||||||
|
committer: "GitHub <noreply@github.com>"
|
||||||
|
commit-message: Update Grafana version
|
||||||
|
labels: dependencies
|
||||||
|
branch: chore/update-minio
|
||||||
|
delete-branch: true
|
66
.github/workflows/helm-ci.yml
vendored
Normal file
66
.github/workflows/helm-ci.yml
vendored
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
---
|
||||||
|
name: helm-ci
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- "charts/meta-monitoring/**"
|
||||||
|
|
||||||
|
env:
|
||||||
|
CT_CONFIGFILE: charts/meta-monitoring/ct.yaml
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
call-lint:
|
||||||
|
name: Lint Helm Chart
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout Code
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Lint Yaml
|
||||||
|
run: make helm-lint
|
||||||
|
|
||||||
|
# call-test:
|
||||||
|
# name: Test Helm Chart
|
||||||
|
# runs-on: ubuntu-latest
|
||||||
|
# steps:
|
||||||
|
# - name: Checkout
|
||||||
|
# uses: actions/checkout@v3
|
||||||
|
# with:
|
||||||
|
# fetch-depth: 0
|
||||||
|
|
||||||
|
# - name: Set up Helm
|
||||||
|
# uses: azure/setup-helm@v3
|
||||||
|
# with:
|
||||||
|
# version: v3.8.2
|
||||||
|
|
||||||
|
# # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
|
||||||
|
# # yamllint (https://github.com/adrienverge/yamllint) which require Python
|
||||||
|
# - name: Set up Python
|
||||||
|
# uses: actions/setup-python@v4
|
||||||
|
# with:
|
||||||
|
# python-version: 3.7
|
||||||
|
|
||||||
|
# - name: Set up chart-testing
|
||||||
|
# uses: helm/chart-testing-action@v2.4.0
|
||||||
|
|
||||||
|
# - name: Run chart-testing (list-changed)
|
||||||
|
# id: list-changed
|
||||||
|
# run: |
|
||||||
|
# changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||||
|
# if [[ -n "$changed" ]]; then
|
||||||
|
# echo "changed=true" >> $GITHUB_OUTPUT
|
||||||
|
# fi
|
||||||
|
|
||||||
|
# - name: Run chart-testing (lint)
|
||||||
|
# run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
|
||||||
|
|
||||||
|
# - name: Create kind cluster
|
||||||
|
# uses: helm/kind-action@v1.8.0
|
||||||
|
# if: steps.list-changed.outputs.changed == 'true'
|
||||||
|
# with:
|
||||||
|
# config: tools/kind.config
|
||||||
|
|
||||||
|
# - name: Run chart-testing (install)
|
||||||
|
# run: |
|
||||||
|
# changed=$(ct list-changed --config "${CT_CONFIGFILE}")
|
||||||
|
# ct install --config "${CT_CONFIGFILE}"
|
175
.github/workflows/helm-release.yml
vendored
Normal file
175
.github/workflows/helm-release.yml
vendored
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
name: Release Helm chart
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
env:
|
||||||
|
CR_CONFIGFILE: "${{ github.workspace }}/source/.github/configs/cr.yaml"
|
||||||
|
CT_CONFIGFILE: "${{ github.workspace }}/source/.github/configs/ct.yaml"
|
||||||
|
CR_INDEX_PATH: "${{ github.workspace }}/.cr-index"
|
||||||
|
CR_PACKAGE_PATH: "${{ github.workspace }}/.cr-release-packages"
|
||||||
|
CR_TOOL_PATH: "${{ github.workspace }}/.cr-tool"
|
||||||
|
CR_VERSION: "1.5.0"
|
||||||
|
jobs:
|
||||||
|
setup:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
changed: ${{ steps.list-changed.outputs.changed }}
|
||||||
|
chartpath: ${{ steps.list-changed.outputs.chartpath }}
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
path: source
|
||||||
|
|
||||||
|
- name: Install chart-testing
|
||||||
|
uses: helm/chart-testing-action@v2
|
||||||
|
|
||||||
|
- name: List changed charts
|
||||||
|
id: list-changed
|
||||||
|
run: |
|
||||||
|
cd source
|
||||||
|
|
||||||
|
latest_tag=$( if ! git describe --tags --abbrev=0 --match='helm-chart/*' 2> /dev/null ; then git rev-list --max-parents=0 --first-parent HEAD; fi )
|
||||||
|
|
||||||
|
echo "Running: ct list-changed --config ${CT_CONFIGFILE} --since ${latest_tag} --target-branch ${{ github.ref_name }}"
|
||||||
|
changed=$(ct list-changed --config "${CT_CONFIGFILE}" --since "${latest_tag}" --target-branch "${{ github.ref_name }}")
|
||||||
|
echo "${changed}"
|
||||||
|
|
||||||
|
num_changed=$(wc -l <<< ${changed})
|
||||||
|
if [[ "${num_changed}" -gt "1" ]] ; then
|
||||||
|
echo "More than one chart changed, exiting"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [[ -n "${changed}" ]]; then
|
||||||
|
name=$(yq ".name" < ${changed}/Chart.yaml)
|
||||||
|
version=$(yq ".version" < ${changed}/Chart.yaml)
|
||||||
|
tagname="v${version}"
|
||||||
|
|
||||||
|
if [ $(git tag -l "${tagname}") ]; then
|
||||||
|
echo "Tag ${tagname} already exists, skipping release"
|
||||||
|
echo "changed=false" >> $GITHUB_OUTPUT
|
||||||
|
else
|
||||||
|
echo "Releasing ${changed}"
|
||||||
|
echo "changed=true" >> $GITHUB_OUTPUT
|
||||||
|
echo "chartpath=${changed}" >> $GITHUB_OUTPUT
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "No charts have changed, skipping release"
|
||||||
|
echo "changed=false" >> $GITHUB_OUTPUT
|
||||||
|
fi
|
||||||
|
|
||||||
|
release:
|
||||||
|
needs: [setup]
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: needs.setup.outputs.changed == 'true'
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
id-token: write
|
||||||
|
steps:
|
||||||
|
- id: get-secrets
|
||||||
|
uses: grafana/shared-workflows/actions/get-vault-secrets@main
|
||||||
|
with:
|
||||||
|
# Secrets placed in the ci/repo/grafana/<repo>/<path> path in Vault
|
||||||
|
repo_secrets: |
|
||||||
|
APP_ID=github-app:app-id
|
||||||
|
PRIVATE_KEY=github-app:private-key
|
||||||
|
- uses: actions/create-github-app-token@v1
|
||||||
|
id: app-token
|
||||||
|
with:
|
||||||
|
app-id: ${{ env.APP_ID }}
|
||||||
|
private-key: ${{ env.PRIVATE_KEY }}
|
||||||
|
owner: ${{ github.repository_owner }}
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
path: source
|
||||||
|
|
||||||
|
- name: Configure Git
|
||||||
|
run: |
|
||||||
|
cd source
|
||||||
|
git config user.name "$GITHUB_ACTOR"
|
||||||
|
git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
|
||||||
|
|
||||||
|
- name: Checkout helm-charts
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
repository: grafana/helm-charts
|
||||||
|
path: helm-charts
|
||||||
|
token: "${{ steps.app-token.outputs.token }}"
|
||||||
|
|
||||||
|
- name: Configure Git for helm-charts
|
||||||
|
run: |
|
||||||
|
cd helm-charts
|
||||||
|
git config user.name "$GITHUB_ACTOR"
|
||||||
|
git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
|
||||||
|
|
||||||
|
- name: Set up Helm
|
||||||
|
uses: azure/setup-helm@v4
|
||||||
|
|
||||||
|
- name: Parse Chart.yaml
|
||||||
|
id: parse-chart
|
||||||
|
run: |
|
||||||
|
cd source
|
||||||
|
changed="${{ needs.setup.outputs.chartpath }}"
|
||||||
|
description=$(yq ".description" < ${changed}/Chart.yaml)
|
||||||
|
name=$(yq ".name" < ${changed}/Chart.yaml)
|
||||||
|
version=$(yq ".version" < ${changed}/Chart.yaml)
|
||||||
|
|
||||||
|
echo "chartpath=${changed}" >> $GITHUB_OUTPUT
|
||||||
|
echo "desc=${description}" >> $GITHUB_OUTPUT
|
||||||
|
echo "tagname=v${version}" >> $GITHUB_OUTPUT
|
||||||
|
echo "packagename=${name}-${version}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Install CR tool
|
||||||
|
run: |
|
||||||
|
mkdir "${CR_TOOL_PATH}"
|
||||||
|
mkdir "${CR_PACKAGE_PATH}"
|
||||||
|
mkdir "${CR_INDEX_PATH}"
|
||||||
|
curl -sSLo cr.tar.gz "https://github.com/helm/chart-releaser/releases/download/v${CR_VERSION}/chart-releaser_${CR_VERSION}_linux_amd64.tar.gz"
|
||||||
|
tar -xzf cr.tar.gz -C "${CR_TOOL_PATH}"
|
||||||
|
rm -f cr.tar.gz
|
||||||
|
|
||||||
|
- name: Create Helm package
|
||||||
|
run: |
|
||||||
|
cd source
|
||||||
|
helm repo add grafana https://grafana.github.io/helm-charts
|
||||||
|
helm repo add minio https://charts.min.io
|
||||||
|
|
||||||
|
"${CR_TOOL_PATH}/cr" package "${{ steps.parse-chart.outputs.chartpath }}" --config "${CR_CONFIGFILE}" --package-path "${CR_PACKAGE_PATH}"
|
||||||
|
|
||||||
|
- name: Make a release on this repo
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
name: ${{ steps.parse-chart.outputs.tagname }}
|
||||||
|
repository: grafana/meta-monitoring-chart
|
||||||
|
tag_name: ${{ steps.parse-chart.outputs.tagname }}
|
||||||
|
token: ${{ steps.app-token.outputs.token }}
|
||||||
|
generate_release_notes: true
|
||||||
|
files: |
|
||||||
|
${{ env.CR_PACKAGE_PATH }}/${{ steps.parse-chart.outputs.packagename }}.tgz
|
||||||
|
|
||||||
|
# Note that this creates a release in grafana/helm-charts with a new tag.
|
||||||
|
# The tag name in grafana/helm-charts is <package>-<version>, while the
|
||||||
|
# tag name for grafana/meta-monitoring-chart is v<version>.
|
||||||
|
- name: Make release on Helm Charts
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
name: ${{ steps.parse-chart.outputs.packagename }}
|
||||||
|
repository: grafana/helm-charts
|
||||||
|
tag_name: ${{ steps.parse-chart.outputs.packagename }}
|
||||||
|
token: ${{ steps.app-token.outputs.token }}
|
||||||
|
body: |
|
||||||
|
${{ steps.parse-chart.outputs.desc }}
|
||||||
|
|
||||||
|
Source commit: https://github.com/${{ github.repository }}/commit/${{ github.sha }}
|
||||||
|
|
||||||
|
Tag on source: https://github.com/${{ github.repository }}/releases/tag/${{ steps.parse-chart.outputs.tagname }}
|
||||||
|
files: |
|
||||||
|
${{ env.CR_PACKAGE_PATH }}/${{ steps.parse-chart.outputs.packagename }}.tgz
|
||||||
|
|
||||||
|
- name: Update helm-charts index.yaml
|
||||||
|
run: |
|
||||||
|
cd helm-charts
|
||||||
|
"${CR_TOOL_PATH}/cr" index --config "${CR_CONFIGFILE}" --token "${{ steps.app-token.outputs.token }}" --index-path "${CR_INDEX_PATH}" --package-path "${CR_PACKAGE_PATH}" --push
|
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
.DS_Store
|
10
Makefile
Normal file
10
Makefile
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Adapted from https://www.thapaliya.com/en/writings/well-documented-makefiles/
|
||||||
|
.PHONY: help
|
||||||
|
help: ## Display this help and any documented user-facing targets. Other undocumented targets may be present in the Makefile.
|
||||||
|
help:
|
||||||
|
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make <target>\n\nTargets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " %-45s %s\n", $$1, $$2 }' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
|
.PHONY: helm-lint
|
||||||
|
|
||||||
|
helm-lint: ## Run helm linter
|
||||||
|
$(MAKE) -BC charts/meta-monitoring lint
|
49
README.md
49
README.md
@@ -1,43 +1,50 @@
|
|||||||
# meta-monitoring-chart
|
# meta-monitoring-chart
|
||||||
|
|
||||||
This is a meta-monitoring chart for GEL, GEM and GET. It should be installed in a
|
This is a meta-monitoring chart for Loki.
|
||||||
separate namespace next to GEM, GEL or GET installations.
|
|
||||||
|
|
||||||
## Preparation
|
Note that this is pre-production software at the moment.
|
||||||
|
|
||||||
Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml).
|
|
||||||
|
|
||||||
1. Add or remove the namespaces to monitor in the `namespacesToMonitor` setting
|
|
||||||
|
|
||||||
1. Set the cluster name in the `clusterName` setting. This will be added as a label to all logs, metrics and traces.
|
|
||||||
|
|
||||||
1. Create a `meta` namespace.
|
|
||||||
|
|
||||||
## Local and cloud modes
|
## Local and cloud modes
|
||||||
|
|
||||||
The chart has 2 modes: local and cloud. In the local mode logs, metrics and traces are sent
|
The chart has 2 modes: local and cloud. In the local mode logs, metrics and/or traces are sent
|
||||||
to small Loki, Mimir and Tempo installations running in the meta-monitoring namespace.
|
to small Loki, Mimir and Tempo installations running in the meta-monitoring namespace.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
To enable local mode set `local.enabled` to true.
|
To enable local mode set `local.<logs|metrics|traces>.enabled` to true.
|
||||||
|
|
||||||
In the cloud mode the logs, metrics and traces are sent to
|
In the cloud mode the logs, metrics and/or traces are sent to Grafana Cloud.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
To enable cloud mode set `cloud.enabled` to true. The `endpoint`, `username` and `password` settings for your Grafana Cloud logs, metrics and traces instances have to be filled in as well.
|
To enable cloud mode set `cloud.<logs|metrics|traces>.enabled` to true. The `endpoint`, `username` and `password` settings for your Grafana Cloud logs, metrics and traces instances have to be filled in as well.
|
||||||
|
|
||||||
Both modes can be enabled at the same time.
|
Both modes can be enabled at the same time. Cloud mode is preferred.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
```
|
|
||||||
helm install -n meta -f values.yaml meta ./charts/meta-monitoring
|
|
||||||
```
|
|
||||||
|
|
||||||
For more instructions including how to update the chart go to the [installation](docs/installation.md) page.
|
For more instructions including how to update the chart go to the [installation](docs/installation.md) page.
|
||||||
|
|
||||||
|
## Supported features
|
||||||
|
|
||||||
|
- Specify which namespaces are monitored
|
||||||
|
- Specify if logs, metrics or traces should be enabled for cloud or local
|
||||||
|
- Specify the cluster name used for the logs, metrics and traces
|
||||||
|
- Specify PII regexes that are applied to logs before they are sent to Loki (cloud or local). The capture group in the regex is replaced with `*****`.
|
||||||
|
- A Grafana instance with the relevant datasources is installed when local mode is used. The following dashboards are installed:
|
||||||
|
- logs dashboards
|
||||||
|
- agent dashboards
|
||||||
|
- Retention is set to 24 hours
|
||||||
|
|
||||||
|
Most of these features are enabled by default. See the values.yaml file for how to enable/disable them.
|
||||||
|
|
||||||
|
## Caveats
|
||||||
|
|
||||||
|
- This has not been tested on OpenShift yet.
|
||||||
|
- The underlying Loki, Mimir and Tempo are at the default size installed by the Helm chart. This might need changing when monitoring bigger Loki, Mimir or Tempo installations.
|
||||||
|
- MinIO is currently used as storage, with limited retention. For now, this chart cannot be used for monitoring over longer periods.
|
||||||
|
- Agent self monitoring is not done at the moment.
|
||||||
|
|
||||||
## Developer help topics
|
## Developer help topics
|
||||||
|
|
||||||
- [update dependencies](docs/dev_update_dependencies.md)
|
- [update dependencies](docs/dev_update_dependencies.md)
|
@@ -1,18 +1,18 @@
|
|||||||
dependencies:
|
dependencies:
|
||||||
- name: loki
|
- name: loki
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 5.8.0
|
version: 6.5.0
|
||||||
- name: grafana-agent
|
- name: alloy
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 0.15.0
|
version: 0.1.1
|
||||||
- name: mimir-distributed
|
- name: mimir-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 4.4.1
|
version: 5.3.0
|
||||||
- name: tempo-distributed
|
- name: tempo-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: 1.4.7
|
version: 1.9.9
|
||||||
- name: minio
|
- name: minio
|
||||||
repository: https://charts.min.io
|
repository: https://charts.min.io
|
||||||
version: 5.0.11
|
version: 5.2.0
|
||||||
digest: sha256:4b04084e6fe821c4d481017b2430f7c8cd782a5d60830dd3a24eb8f10a9ece09
|
digest: sha256:5328702b5f6b0487aba8f7bc77d6abfcd5e094569e9205cd725971e3e31255dd
|
||||||
generated: "2023-06-29T14:25:07.247853+01:00"
|
generated: "2024-05-08T07:03:21.797461955Z"
|
||||||
|
@@ -1,7 +1,6 @@
|
|||||||
apiVersion: v2
|
apiVersion: v2
|
||||||
name: meta-monitoring
|
name: meta-monitoring
|
||||||
description: A Helm chart for meta monitoring Grafana Loki, Mimir and Tempo
|
description: A Helm chart for meta monitoring Grafana Loki, Mimir and Tempo
|
||||||
|
|
||||||
# A chart can be either an 'application' or a 'library' chart.
|
# A chart can be either an 'application' or a 'library' chart.
|
||||||
#
|
#
|
||||||
# Application charts are a collection of templates that can be packaged into versioned archives
|
# Application charts are a collection of templates that can be packaged into versioned archives
|
||||||
@@ -11,35 +10,32 @@ description: A Helm chart for meta monitoring Grafana Loki, Mimir and Tempo
|
|||||||
# a dependency of application charts to inject those utilities and functions into the rendering
|
# a dependency of application charts to inject those utilities and functions into the rendering
|
||||||
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
|
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
|
||||||
type: application
|
type: application
|
||||||
|
|
||||||
# This is the chart version. This version number should be incremented each time you make changes
|
# This is the chart version. This version number should be incremented each time you make changes
|
||||||
# to the chart and its templates, including the app version.
|
# to the chart and its templates, including the app version.
|
||||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||||
version: 0.0.1
|
version: 0.0.2
|
||||||
|
|
||||||
# This is the version number of the application being deployed. This version number should be
|
# This is the version number of the application being deployed. This version number should be
|
||||||
# incremented each time you make changes to the application. Versions are not expected to
|
# incremented each time you make changes to the application. Versions are not expected to
|
||||||
# follow Semantic Versioning. They should reflect the version the application is using.
|
# follow Semantic Versioning. They should reflect the version the application is using.
|
||||||
# It is recommended to use it with quotes.
|
# It is recommended to use it with quotes.
|
||||||
appVersion: "0.0.1"
|
appVersion: "0.0.1"
|
||||||
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- name: loki
|
- name: loki
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "5.8.0"
|
version: 6.5.0
|
||||||
condition: local.logs.enabled
|
condition: local.logs.enabled
|
||||||
- name: grafana-agent
|
- name: alloy
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "0.15.0"
|
version: 0.1.1
|
||||||
- name: mimir-distributed
|
- name: mimir-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "4.4.1"
|
version: 5.3.0
|
||||||
condition: local.metrics.enabled
|
condition: local.metrics.enabled
|
||||||
- name: tempo-distributed
|
- name: tempo-distributed
|
||||||
repository: https://grafana.github.io/helm-charts
|
repository: https://grafana.github.io/helm-charts
|
||||||
version: "1.4.7"
|
version: 1.9.9
|
||||||
condition: local.traces.enabled
|
condition: local.traces.enabled
|
||||||
- name: minio
|
- name: minio
|
||||||
repository: https://charts.min.io
|
repository: https://charts.min.io
|
||||||
version: "5.0.11"
|
version: 5.2.0
|
||||||
condition: local.minio.enabled
|
condition: local.minio.enabled
|
||||||
|
7
charts/meta-monitoring/Makefile
Normal file
7
charts/meta-monitoring/Makefile
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
.DEFAULT_GOAL := lint
|
||||||
|
.PHONY: lint lint-yaml
|
||||||
|
|
||||||
|
lint: lint-yaml
|
||||||
|
|
||||||
|
lint-yaml:
|
||||||
|
yamllint -c $(CURDIR)/src/.yamllint.yaml $(CURDIR)/src
|
BIN
charts/meta-monitoring/charts/alloy-0.1.1.tgz
Normal file
BIN
charts/meta-monitoring/charts/alloy-0.1.1.tgz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/loki-6.5.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/loki-6.5.0.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/mimir-distributed-5.3.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/mimir-distributed-5.3.0.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/minio-5.2.0.tgz
Normal file
BIN
charts/meta-monitoring/charts/minio-5.2.0.tgz
Normal file
Binary file not shown.
Binary file not shown.
BIN
charts/meta-monitoring/charts/tempo-distributed-1.9.9.tgz
Normal file
BIN
charts/meta-monitoring/charts/tempo-distributed-1.9.9.tgz
Normal file
Binary file not shown.
11
charts/meta-monitoring/ct.yaml
Normal file
11
charts/meta-monitoring/ct.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
remote: origin
|
||||||
|
target-branch: main
|
||||||
|
chart-dirs:
|
||||||
|
- charts
|
||||||
|
chart-repos:
|
||||||
|
- grafana=https://grafana.github.io/helm-charts
|
||||||
|
- minio=https://charts.min.io
|
||||||
|
helm-extra-args: --timeout 1200s
|
||||||
|
check-version-increment: false
|
||||||
|
validate-maintainers: false
|
4
charts/meta-monitoring/src/.yamllint.yaml
Normal file
4
charts/meta-monitoring/src/.yamllint.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
rules:
|
||||||
|
quoted-strings:
|
||||||
|
required: true
|
1082
charts/meta-monitoring/src/dashboards/agent-logs-pipeline.json
Normal file
1082
charts/meta-monitoring/src/dashboards/agent-logs-pipeline.json
Normal file
File diff suppressed because it is too large
Load Diff
1189
charts/meta-monitoring/src/dashboards/agent-operational.json
Normal file
1189
charts/meta-monitoring/src/dashboards/agent-operational.json
Normal file
File diff suppressed because it is too large
Load Diff
1512
charts/meta-monitoring/src/dashboards/agent-remote-write.json
Normal file
1512
charts/meta-monitoring/src/dashboards/agent-remote-write.json
Normal file
File diff suppressed because it is too large
Load Diff
1065
charts/meta-monitoring/src/dashboards/agent-tracing-pipeline.json
Normal file
1065
charts/meta-monitoring/src/dashboards/agent-tracing-pipeline.json
Normal file
File diff suppressed because it is too large
Load Diff
786
charts/meta-monitoring/src/dashboards/agent.json
Normal file
786
charts/meta-monitoring/src/dashboards/agent.json
Normal file
@@ -0,0 +1,786 @@
|
|||||||
|
{
|
||||||
|
"annotations": {
|
||||||
|
"list": [ ]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"gnetId": null,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"hideControls": false,
|
||||||
|
"links": [ ],
|
||||||
|
"refresh": "30s",
|
||||||
|
"rows": [
|
||||||
|
{
|
||||||
|
"collapse": false,
|
||||||
|
"height": "250px",
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 1,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 12,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"styles": [
|
||||||
|
{
|
||||||
|
"alias": "Time",
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"pattern": "Time",
|
||||||
|
"type": "hidden"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alias": "Count",
|
||||||
|
"colorMode": null,
|
||||||
|
"colors": [ ],
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"decimals": 2,
|
||||||
|
"link": false,
|
||||||
|
"linkTargetBlank": false,
|
||||||
|
"linkTooltip": "Drill down",
|
||||||
|
"linkUrl": "",
|
||||||
|
"pattern": "Value #A",
|
||||||
|
"thresholds": [ ],
|
||||||
|
"type": "hidden",
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alias": "Uptime",
|
||||||
|
"colorMode": null,
|
||||||
|
"colors": [ ],
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"decimals": 2,
|
||||||
|
"link": false,
|
||||||
|
"linkTargetBlank": false,
|
||||||
|
"linkTooltip": "Drill down",
|
||||||
|
"linkUrl": "",
|
||||||
|
"pattern": "Value #B",
|
||||||
|
"thresholds": [ ],
|
||||||
|
"type": "number",
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alias": "Container",
|
||||||
|
"colorMode": null,
|
||||||
|
"colors": [ ],
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"decimals": 2,
|
||||||
|
"link": false,
|
||||||
|
"linkTargetBlank": false,
|
||||||
|
"linkTooltip": "Drill down",
|
||||||
|
"linkUrl": "",
|
||||||
|
"pattern": "container",
|
||||||
|
"thresholds": [ ],
|
||||||
|
"type": "number",
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alias": "Pod",
|
||||||
|
"colorMode": null,
|
||||||
|
"colors": [ ],
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"decimals": 2,
|
||||||
|
"link": false,
|
||||||
|
"linkTargetBlank": false,
|
||||||
|
"linkTooltip": "Drill down",
|
||||||
|
"linkUrl": "",
|
||||||
|
"pattern": "pod",
|
||||||
|
"thresholds": [ ],
|
||||||
|
"type": "number",
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alias": "Version",
|
||||||
|
"colorMode": null,
|
||||||
|
"colors": [ ],
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"decimals": 2,
|
||||||
|
"link": false,
|
||||||
|
"linkTargetBlank": false,
|
||||||
|
"linkTooltip": "Drill down",
|
||||||
|
"linkUrl": "",
|
||||||
|
"pattern": "version",
|
||||||
|
"thresholds": [ ],
|
||||||
|
"type": "number",
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alias": "",
|
||||||
|
"colorMode": null,
|
||||||
|
"colors": [ ],
|
||||||
|
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||||
|
"decimals": 2,
|
||||||
|
"pattern": "/.*/",
|
||||||
|
"thresholds": [ ],
|
||||||
|
"type": "string",
|
||||||
|
"unit": "short"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count by (pod, container, version) (agent_build_info{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "",
|
||||||
|
"refId": "A",
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "max by (pod, container) (time() - process_start_time_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "",
|
||||||
|
"refId": "B",
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Agent Stats",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"transform": "table",
|
||||||
|
"type": "table",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": null,
|
||||||
|
"repeatIteration": null,
|
||||||
|
"repeatRowId": null,
|
||||||
|
"showTitle": true,
|
||||||
|
"title": "Agent Stats",
|
||||||
|
"titleSize": "h6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapse": false,
|
||||||
|
"height": "250px",
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 2,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 6,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])) by (pod, scrape_job) * 1e3",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{pod}}/{{scrape_job}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Target Sync",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "ms",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 10,
|
||||||
|
"id": 3,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 0,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 6,
|
||||||
|
"stack": true,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (pod) (prometheus_sd_discovered_targets{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{pod}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Targets",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": null,
|
||||||
|
"repeatIteration": null,
|
||||||
|
"repeatRowId": null,
|
||||||
|
"showTitle": true,
|
||||||
|
"title": "Prometheus Discovery",
|
||||||
|
"titleSize": "h6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapse": false,
|
||||||
|
"height": "250px",
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 4,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 4,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(prometheus_target_interval_length_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])\n/\nrate(prometheus_target_interval_length_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])\n* 1e3\n",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{pod}} {{interval}} configured",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Average Scrape Interval Duration",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "ms",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 10,
|
||||||
|
"id": 5,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 0,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 4,
|
||||||
|
"stack": true,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "exceeded sample limit: {{job}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "duplicate timestamp: {{job}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "out of bounds: {{job}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "out of order: {{job}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Scrape failures",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 10,
|
||||||
|
"id": 6,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 0,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 4,
|
||||||
|
"stack": true,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (job, instance_group_name) (rate(agent_wal_samples_appended_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m]))",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{job}} {{instance_group_name}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Appended Samples",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": null,
|
||||||
|
"repeatIteration": null,
|
||||||
|
"repeatRowId": null,
|
||||||
|
"showTitle": true,
|
||||||
|
"title": "Prometheus Retrieval",
|
||||||
|
"titleSize": "h6"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 14,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": [
|
||||||
|
"grafana-agent-mixin"
|
||||||
|
],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"current": {
|
||||||
|
"text": "default",
|
||||||
|
"value": "default"
|
||||||
|
},
|
||||||
|
"hide": 0,
|
||||||
|
"label": "Data Source",
|
||||||
|
"name": "datasource",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "prometheus",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"type": "datasource"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": ".+",
|
||||||
|
"current": {
|
||||||
|
"selected": true,
|
||||||
|
"text": "All",
|
||||||
|
"value": "$__all"
|
||||||
|
},
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "cluster",
|
||||||
|
"multi": true,
|
||||||
|
"name": "cluster",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "label_values(agent_build_info, cluster)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"sort": 2,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [ ],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": ".+",
|
||||||
|
"current": {
|
||||||
|
"selected": true,
|
||||||
|
"text": "All",
|
||||||
|
"value": "$__all"
|
||||||
|
},
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "namespace",
|
||||||
|
"multi": true,
|
||||||
|
"name": "namespace",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "label_values(agent_build_info, namespace)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"sort": 2,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [ ],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": ".+",
|
||||||
|
"current": {
|
||||||
|
"selected": true,
|
||||||
|
"text": "All",
|
||||||
|
"value": "$__all"
|
||||||
|
},
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "container",
|
||||||
|
"multi": true,
|
||||||
|
"name": "container",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "label_values(agent_build_info, container)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"sort": 2,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [ ],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": "grafana-agent-.*",
|
||||||
|
"current": {
|
||||||
|
"selected": true,
|
||||||
|
"text": "All",
|
||||||
|
"value": "$__all"
|
||||||
|
},
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "pod",
|
||||||
|
"multi": true,
|
||||||
|
"name": "pod",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "label_values(agent_build_info{container=~\"$container\"}, pod)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"sort": 2,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [ ],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {
|
||||||
|
"refresh_intervals": [
|
||||||
|
"5s",
|
||||||
|
"10s",
|
||||||
|
"30s",
|
||||||
|
"1m",
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"30m",
|
||||||
|
"1h",
|
||||||
|
"2h",
|
||||||
|
"1d"
|
||||||
|
],
|
||||||
|
"time_options": [
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"1h",
|
||||||
|
"6h",
|
||||||
|
"12h",
|
||||||
|
"24h",
|
||||||
|
"2d",
|
||||||
|
"7d",
|
||||||
|
"30d"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"timezone": "",
|
||||||
|
"title": "Agent",
|
||||||
|
"uid": "",
|
||||||
|
"version": 0
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
@@ -35,6 +35,7 @@
|
|||||||
"fill": 1,
|
"fill": 1,
|
||||||
"format": "none",
|
"format": "none",
|
||||||
"id": 1,
|
"id": 1,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -62,7 +63,6 @@
|
|||||||
"expr": "sum(loki_compactor_pending_delete_requests_count{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
|
"expr": "sum(loki_compactor_pending_delete_requests_count{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"instant": true,
|
"instant": true,
|
||||||
"intervalFactor": 2,
|
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -111,6 +111,7 @@
|
|||||||
"fill": 1,
|
"fill": 1,
|
||||||
"format": "dtdurations",
|
"format": "dtdurations",
|
||||||
"id": 2,
|
"id": 2,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -138,7 +139,6 @@
|
|||||||
"expr": "max(loki_compactor_oldest_pending_delete_request_age_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
|
"expr": "max(loki_compactor_oldest_pending_delete_request_age_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"instant": true,
|
"instant": true,
|
||||||
"intervalFactor": 2,
|
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -191,156 +191,148 @@
|
|||||||
"height": "250px",
|
"height": "250px",
|
||||||
"panels": [
|
"panels": [
|
||||||
{
|
{
|
||||||
"aliasColors": { },
|
|
||||||
"bars": false,
|
|
||||||
"dashLength": 10,
|
|
||||||
"dashes": false,
|
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fieldConfig": {
|
||||||
"id": 3,
|
"defaults": {
|
||||||
"legend": {
|
"custom": {
|
||||||
"avg": false,
|
"drawStyle": "line",
|
||||||
"current": false,
|
"fillOpacity": 10,
|
||||||
"max": false,
|
"lineWidth": 1,
|
||||||
"min": false,
|
"pointSize": 5,
|
||||||
"show": true,
|
"showPoints": "never",
|
||||||
"total": false,
|
"spanNulls": false,
|
||||||
"values": false
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [ ]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": [ ]
|
||||||
},
|
},
|
||||||
"lines": true,
|
"id": 3,
|
||||||
"linewidth": 1,
|
"interval": "1m",
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null as zero",
|
"options": {
|
||||||
"percentage": false,
|
"legend": {
|
||||||
"pointradius": 5,
|
"showLegend": true
|
||||||
"points": false,
|
},
|
||||||
"renderer": "flot",
|
"tooltip": {
|
||||||
"seriesOverrides": [ ],
|
"mode": "single",
|
||||||
"spaceLength": 10,
|
"sort": "none"
|
||||||
"span": 6,
|
}
|
||||||
"stack": false,
|
},
|
||||||
"steppedLine": false,
|
"span": 4,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} or on() vector(0)) - on () (loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} or on () vector(0))",
|
||||||
|
"format": "time_series",
|
||||||
|
"legendFormat": "in progress",
|
||||||
|
"legendLink": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "# of Delete Requests (received - processed) ",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [ ]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": [ ]
|
||||||
|
},
|
||||||
|
"id": 4,
|
||||||
|
"interval": "1m",
|
||||||
|
"links": [ ],
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"showLegend": true
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single",
|
||||||
|
"sort": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"span": 4,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
"expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
|
||||||
"legendFormat": "received",
|
"legendFormat": "received",
|
||||||
"legendLink": null,
|
"legendLink": null
|
||||||
"step": 10
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [ ],
|
|
||||||
"timeFrom": null,
|
|
||||||
"timeShift": null,
|
|
||||||
"title": "Delete Requests Received / Day",
|
"title": "Delete Requests Received / Day",
|
||||||
"tooltip": {
|
"type": "timeseries"
|
||||||
"shared": true,
|
|
||||||
"sort": 2,
|
|
||||||
"value_type": "individual"
|
|
||||||
},
|
|
||||||
"type": "graph",
|
|
||||||
"xaxis": {
|
|
||||||
"buckets": null,
|
|
||||||
"mode": "time",
|
|
||||||
"name": null,
|
|
||||||
"show": true,
|
|
||||||
"values": [ ]
|
|
||||||
},
|
|
||||||
"yaxes": [
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": 0,
|
|
||||||
"show": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": null,
|
|
||||||
"show": false
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"aliasColors": { },
|
|
||||||
"bars": false,
|
|
||||||
"dashLength": 10,
|
|
||||||
"dashes": false,
|
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fieldConfig": {
|
||||||
"id": 4,
|
"defaults": {
|
||||||
"legend": {
|
"custom": {
|
||||||
"avg": false,
|
"drawStyle": "line",
|
||||||
"current": false,
|
"fillOpacity": 10,
|
||||||
"max": false,
|
"lineWidth": 1,
|
||||||
"min": false,
|
"pointSize": 5,
|
||||||
"show": true,
|
"showPoints": "never",
|
||||||
"total": false,
|
"spanNulls": false,
|
||||||
"values": false
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [ ]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": [ ]
|
||||||
},
|
},
|
||||||
"lines": true,
|
"id": 5,
|
||||||
"linewidth": 1,
|
"interval": "1m",
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null as zero",
|
"options": {
|
||||||
"percentage": false,
|
"legend": {
|
||||||
"pointradius": 5,
|
"showLegend": true
|
||||||
"points": false,
|
},
|
||||||
"renderer": "flot",
|
"tooltip": {
|
||||||
"seriesOverrides": [ ],
|
"mode": "single",
|
||||||
"spaceLength": 10,
|
"sort": "none"
|
||||||
"span": 6,
|
}
|
||||||
"stack": false,
|
},
|
||||||
"steppedLine": false,
|
"span": 4,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(increase(loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
"expr": "sum(increase(loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
|
||||||
"legendFormat": "processed",
|
"legendFormat": "processed",
|
||||||
"legendLink": null,
|
"legendLink": null
|
||||||
"step": 10
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [ ],
|
|
||||||
"timeFrom": null,
|
|
||||||
"timeShift": null,
|
|
||||||
"title": "Delete Requests Processed / Day",
|
"title": "Delete Requests Processed / Day",
|
||||||
"tooltip": {
|
"type": "timeseries"
|
||||||
"shared": true,
|
|
||||||
"sort": 2,
|
|
||||||
"value_type": "individual"
|
|
||||||
},
|
|
||||||
"type": "graph",
|
|
||||||
"xaxis": {
|
|
||||||
"buckets": null,
|
|
||||||
"mode": "time",
|
|
||||||
"name": null,
|
|
||||||
"show": true,
|
|
||||||
"values": [ ]
|
|
||||||
},
|
|
||||||
"yaxes": [
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": 0,
|
|
||||||
"show": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": null,
|
|
||||||
"show": false
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"repeat": null,
|
"repeat": null,
|
||||||
@@ -355,87 +347,155 @@
|
|||||||
"height": "250px",
|
"height": "250px",
|
||||||
"panels": [
|
"panels": [
|
||||||
{
|
{
|
||||||
"aliasColors": { },
|
|
||||||
"bars": false,
|
|
||||||
"dashLength": 10,
|
|
||||||
"dashes": false,
|
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fieldConfig": {
|
||||||
"id": 5,
|
"defaults": {
|
||||||
"legend": {
|
"custom": {
|
||||||
"avg": false,
|
"drawStyle": "line",
|
||||||
"current": false,
|
"fillOpacity": 10,
|
||||||
"max": false,
|
"lineWidth": 1,
|
||||||
"min": false,
|
"pointSize": 5,
|
||||||
"show": true,
|
"showPoints": "never",
|
||||||
"total": false,
|
"spanNulls": false,
|
||||||
"values": false
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [ ]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": [ ]
|
||||||
},
|
},
|
||||||
"lines": true,
|
"id": 6,
|
||||||
"linewidth": 1,
|
"interval": "1m",
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null as zero",
|
"options": {
|
||||||
"percentage": false,
|
"legend": {
|
||||||
"pointradius": 5,
|
"showLegend": true
|
||||||
"points": false,
|
},
|
||||||
"renderer": "flot",
|
"tooltip": {
|
||||||
"seriesOverrides": [ ],
|
"mode": "single",
|
||||||
"spaceLength": 10,
|
"sort": "none"
|
||||||
"span": 12,
|
}
|
||||||
"stack": false,
|
},
|
||||||
"steppedLine": false,
|
"span": 4,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(increase(loki_compactor_load_pending_requests_attempts_total{status=\"fail\", cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]))",
|
"expr": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"}",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"legendFormat": "{{pod}}",
|
||||||
"legendFormat": "failures",
|
"legendLink": null
|
||||||
"legendLink": null,
|
|
||||||
"step": 10
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [ ],
|
"title": "Compactor CPU usage",
|
||||||
"timeFrom": null,
|
"type": "timeseries"
|
||||||
"timeShift": null,
|
},
|
||||||
"title": "Failures in Loading Delete Requests / Hour",
|
{
|
||||||
"tooltip": {
|
"datasource": "$datasource",
|
||||||
"shared": true,
|
"fieldConfig": {
|
||||||
"sort": 2,
|
"defaults": {
|
||||||
"value_type": "individual"
|
"custom": {
|
||||||
},
|
"drawStyle": "line",
|
||||||
"type": "graph",
|
"fillOpacity": 10,
|
||||||
"xaxis": {
|
"lineWidth": 1,
|
||||||
"buckets": null,
|
"pointSize": 5,
|
||||||
"mode": "time",
|
"showPoints": "never",
|
||||||
"name": null,
|
"spanNulls": false,
|
||||||
"show": true,
|
"stacking": {
|
||||||
"values": [ ]
|
"group": "A",
|
||||||
},
|
"mode": "none"
|
||||||
"yaxes": [
|
}
|
||||||
{
|
},
|
||||||
"format": "short",
|
"thresholds": {
|
||||||
"label": null,
|
"mode": "absolute",
|
||||||
"logBase": 1,
|
"steps": [ ]
|
||||||
"max": null,
|
},
|
||||||
"min": 0,
|
"unit": "short"
|
||||||
"show": true
|
|
||||||
},
|
},
|
||||||
{
|
"overrides": [ ]
|
||||||
"format": "short",
|
},
|
||||||
"label": null,
|
"id": 7,
|
||||||
"logBase": 1,
|
"interval": "1m",
|
||||||
"max": null,
|
"links": [ ],
|
||||||
"min": null,
|
"options": {
|
||||||
"show": false
|
"legend": {
|
||||||
|
"showLegend": true
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single",
|
||||||
|
"sort": "none"
|
||||||
}
|
}
|
||||||
]
|
},
|
||||||
|
"span": 4,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} / 1024 / 1024 ",
|
||||||
|
"format": "time_series",
|
||||||
|
"legendFormat": " {{pod}} ",
|
||||||
|
"legendLink": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Compactor memory usage (MiB)",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [ ]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": [ ]
|
||||||
|
},
|
||||||
|
"id": 8,
|
||||||
|
"interval": "1m",
|
||||||
|
"links": [ ],
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"showLegend": true
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single",
|
||||||
|
"sort": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"span": 4,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "loki_boltdb_shipper_compact_tables_operation_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"}",
|
||||||
|
"format": "time_series",
|
||||||
|
"legendFormat": "{{pod}}",
|
||||||
|
"legendLink": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Compaction run duration (seconds)",
|
||||||
|
"type": "timeseries"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"repeat": null,
|
"repeat": null,
|
||||||
"repeatIteration": null,
|
"repeatIteration": null,
|
||||||
"repeatRowId": null,
|
"repeatRowId": null,
|
||||||
"showTitle": true,
|
"showTitle": true,
|
||||||
"title": "Failures",
|
"title": "Compactor",
|
||||||
"titleSize": "h6"
|
"titleSize": "h6"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -443,87 +503,147 @@
|
|||||||
"height": "250px",
|
"height": "250px",
|
||||||
"panels": [
|
"panels": [
|
||||||
{
|
{
|
||||||
"aliasColors": { },
|
|
||||||
"bars": false,
|
|
||||||
"dashLength": 10,
|
|
||||||
"dashes": false,
|
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fieldConfig": {
|
||||||
"id": 6,
|
"defaults": {
|
||||||
"legend": {
|
"custom": {
|
||||||
"avg": false,
|
"drawStyle": "line",
|
||||||
"current": false,
|
"fillOpacity": 10,
|
||||||
"max": false,
|
"lineWidth": 1,
|
||||||
"min": false,
|
"pointSize": 5,
|
||||||
"show": true,
|
"showPoints": "never",
|
||||||
"total": false,
|
"spanNulls": false,
|
||||||
"values": false
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [ ]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": [ ]
|
||||||
},
|
},
|
||||||
"lines": true,
|
"id": 9,
|
||||||
"linewidth": 1,
|
"interval": "1m",
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null as zero",
|
"options": {
|
||||||
"percentage": false,
|
"legend": {
|
||||||
"pointradius": 5,
|
"showLegend": true
|
||||||
"points": false,
|
},
|
||||||
"renderer": "flot",
|
"tooltip": {
|
||||||
"seriesOverrides": [ ],
|
"mode": "single",
|
||||||
"spaceLength": 10,
|
"sort": "none"
|
||||||
"span": 12,
|
}
|
||||||
"stack": false,
|
},
|
||||||
"steppedLine": false,
|
"span": 6,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\",job=~\"$namespace/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (user)",
|
"expr": "sum(increase(loki_compactor_load_pending_requests_attempts_total{status=\"fail\", cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"legendFormat": "failures",
|
||||||
"legendFormat": "{{user}}",
|
"legendLink": null
|
||||||
"legendLink": null,
|
|
||||||
"step": 10
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [ ],
|
"title": "Failures in Loading Delete Requests / Hour",
|
||||||
"timeFrom": null,
|
"type": "timeseries"
|
||||||
"timeShift": null,
|
},
|
||||||
"title": "Lines Deleted / Sec",
|
{
|
||||||
"tooltip": {
|
"datasource": "$datasource",
|
||||||
"shared": true,
|
"fieldConfig": {
|
||||||
"sort": 2,
|
"defaults": {
|
||||||
"value_type": "individual"
|
"custom": {
|
||||||
},
|
"drawStyle": "line",
|
||||||
"type": "graph",
|
"fillOpacity": 10,
|
||||||
"xaxis": {
|
"lineWidth": 1,
|
||||||
"buckets": null,
|
"pointSize": 5,
|
||||||
"mode": "time",
|
"showPoints": "never",
|
||||||
"name": null,
|
"spanNulls": false,
|
||||||
"show": true,
|
"stacking": {
|
||||||
"values": [ ]
|
"group": "A",
|
||||||
},
|
"mode": "none"
|
||||||
"yaxes": [
|
}
|
||||||
{
|
},
|
||||||
"format": "short",
|
"thresholds": {
|
||||||
"label": null,
|
"mode": "absolute",
|
||||||
"logBase": 1,
|
"steps": [ ]
|
||||||
"max": null,
|
},
|
||||||
"min": 0,
|
"unit": "short"
|
||||||
"show": true
|
|
||||||
},
|
},
|
||||||
{
|
"overrides": [ ]
|
||||||
"format": "short",
|
},
|
||||||
"label": null,
|
"id": 10,
|
||||||
"logBase": 1,
|
"interval": "1m",
|
||||||
"max": null,
|
"links": [ ],
|
||||||
"min": null,
|
"options": {
|
||||||
"show": false
|
"legend": {
|
||||||
|
"showLegend": true
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single",
|
||||||
|
"sort": "none"
|
||||||
}
|
}
|
||||||
]
|
},
|
||||||
|
"span": 6,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*/compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"}[$__rate_interval])) by (user)",
|
||||||
|
"format": "time_series",
|
||||||
|
"legendFormat": "{{user}}",
|
||||||
|
"legendLink": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Lines Deleted / Sec",
|
||||||
|
"type": "timeseries"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"repeat": null,
|
"repeat": null,
|
||||||
"repeatIteration": null,
|
"repeatIteration": null,
|
||||||
"repeatRowId": null,
|
"repeatRowId": null,
|
||||||
"showTitle": true,
|
"showTitle": true,
|
||||||
"title": "Deleted lines",
|
"title": "Deletion metrics",
|
||||||
|
"titleSize": "h6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapse": false,
|
||||||
|
"height": "250px",
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"datasource": "$loki_datasource",
|
||||||
|
"id": 11,
|
||||||
|
"interval": "1m",
|
||||||
|
"span": 6,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*/compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"} |~ \"Started processing delete request|delete request for user marked as processed\" | logfmt | line_format \"{{.ts}} user={{.user}} delete_request_id={{.delete_request_id}} msg={{.msg}}\" ",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "In progress/finished",
|
||||||
|
"type": "logs"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "$loki_datasource",
|
||||||
|
"id": 12,
|
||||||
|
"interval": "1m",
|
||||||
|
"span": 6,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*/compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"} |~ \"delete request for user added\" | logfmt | line_format \"{{.ts}} user={{.user}} query='{{.query}}'\"",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Requests",
|
||||||
|
"type": "logs"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": null,
|
||||||
|
"repeatIteration": null,
|
||||||
|
"repeatRowId": null,
|
||||||
|
"showTitle": true,
|
||||||
|
"title": "List of deletion requests",
|
||||||
"titleSize": "h6"
|
"titleSize": "h6"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -540,7 +660,7 @@
|
|||||||
"value": "default"
|
"value": "default"
|
||||||
},
|
},
|
||||||
"hide": 0,
|
"hide": 0,
|
||||||
"label": "Data Source",
|
"label": "Data source",
|
||||||
"name": "datasource",
|
"name": "datasource",
|
||||||
"options": [ ],
|
"options": [ ],
|
||||||
"query": "prometheus",
|
"query": "prometheus",
|
||||||
@@ -593,6 +713,16 @@
|
|||||||
"tagsQuery": "",
|
"tagsQuery": "",
|
||||||
"type": "query",
|
"type": "query",
|
||||||
"useTags": false
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hide": 0,
|
||||||
|
"label": null,
|
||||||
|
"name": "loki_datasource",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "loki",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"type": "datasource"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@@ -6,7 +6,6 @@
|
|||||||
"gnetId": null,
|
"gnetId": null,
|
||||||
"graphTooltip": 0,
|
"graphTooltip": 0,
|
||||||
"hideControls": false,
|
"hideControls": false,
|
||||||
"id": 8,
|
|
||||||
"iteration": 1583185057230,
|
"iteration": 1583185057230,
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
@@ -39,6 +38,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 35,
|
"id": 35,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -78,7 +78,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -115,6 +115,11 @@
|
|||||||
"dashLength": 10,
|
"dashLength": 10,
|
||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"fillGradient": 0,
|
"fillGradient": 0,
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
@@ -125,6 +130,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 41,
|
"id": 41,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -165,7 +171,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -212,6 +218,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 36,
|
"id": 36,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -237,7 +244,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))",
|
"expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[$__rate_interval]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -251,7 +258,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -288,6 +295,11 @@
|
|||||||
"dashLength": 10,
|
"dashLength": 10,
|
||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"fillGradient": 0,
|
"fillGradient": 0,
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
@@ -298,6 +310,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 40,
|
"id": 40,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -337,7 +350,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -374,6 +387,11 @@
|
|||||||
"dashLength": 10,
|
"dashLength": 10,
|
||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "binBps"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"fillGradient": 0,
|
"fillGradient": 0,
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
@@ -384,6 +402,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 38,
|
"id": 38,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -409,7 +428,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
|
"expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -423,7 +442,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -460,6 +479,11 @@
|
|||||||
"dashLength": 10,
|
"dashLength": 10,
|
||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "binBps"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"fillGradient": 0,
|
"fillGradient": 0,
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
@@ -470,6 +494,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 39,
|
"id": 39,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -495,7 +520,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
|
"expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -509,7 +534,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -556,6 +581,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 37,
|
"id": 37,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -596,7 +622,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -633,6 +659,11 @@
|
|||||||
"dashLength": 10,
|
"dashLength": 10,
|
||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ops"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"fillGradient": 0,
|
"fillGradient": 0,
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
@@ -643,6 +674,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 42,
|
"id": 42,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -668,7 +700,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)",
|
"expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[$__rate_interval])) by (level)",
|
||||||
"legendFormat": "{{level}}",
|
"legendFormat": "{{level}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
@@ -683,7 +715,7 @@
|
|||||||
"sort": 0,
|
"sort": 0,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -719,7 +751,12 @@
|
|||||||
"bars": false,
|
"bars": false,
|
||||||
"dashLength": 10,
|
"dashLength": 10,
|
||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$logs",
|
"datasource": "$loki_datasource",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ops"
|
||||||
|
}
|
||||||
|
},
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"fillGradient": 0,
|
"fillGradient": 0,
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
@@ -730,6 +767,7 @@
|
|||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 31,
|
"id": 31,
|
||||||
|
"interval": "1m",
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -772,7 +810,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)",
|
"expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" | __error__=\"\" [$__interval])) by (level)",
|
||||||
"intervalFactor": 3,
|
"intervalFactor": 3,
|
||||||
"legendFormat": "{{level}}",
|
"legendFormat": "{{level}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
@@ -788,7 +826,7 @@
|
|||||||
"sort": 2,
|
"sort": 2,
|
||||||
"value_type": "individual"
|
"value_type": "individual"
|
||||||
},
|
},
|
||||||
"type": "graph",
|
"type": "timeseries",
|
||||||
"xaxis": {
|
"xaxis": {
|
||||||
"buckets": null,
|
"buckets": null,
|
||||||
"mode": "time",
|
"mode": "time",
|
||||||
@@ -820,7 +858,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"datasource": "$logs",
|
"datasource": "$loki_datasource",
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 19,
|
"h": 19,
|
||||||
"w": 24,
|
"w": 24,
|
||||||
@@ -828,6 +866,7 @@
|
|||||||
"y": 6
|
"y": 6
|
||||||
},
|
},
|
||||||
"id": 29,
|
"id": 29,
|
||||||
|
"interval": "1m",
|
||||||
"maxDataPoints": "",
|
"maxDataPoints": "",
|
||||||
"options": {
|
"options": {
|
||||||
"showLabels": false,
|
"showLabels": false,
|
||||||
@@ -862,7 +901,7 @@
|
|||||||
"value": "default"
|
"value": "default"
|
||||||
},
|
},
|
||||||
"hide": 0,
|
"hide": 0,
|
||||||
"label": "Data Source",
|
"label": "Data source",
|
||||||
"name": "datasource",
|
"name": "datasource",
|
||||||
"options": [ ],
|
"options": [ ],
|
||||||
"query": "prometheus",
|
"query": "prometheus",
|
||||||
@@ -919,7 +958,7 @@
|
|||||||
{
|
{
|
||||||
"hide": 0,
|
"hide": 0,
|
||||||
"label": null,
|
"label": null,
|
||||||
"name": "logs",
|
"name": "loki_datasource",
|
||||||
"options": [ ],
|
"options": [ ],
|
||||||
"query": "loki",
|
"query": "loki",
|
||||||
"refresh": 1,
|
"refresh": 1,
|
||||||
|
@@ -1,657 +0,0 @@
|
|||||||
{
|
|
||||||
"annotations": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"builtIn": 1,
|
|
||||||
"datasource": "-- Grafana --",
|
|
||||||
"enable": true,
|
|
||||||
"hide": true,
|
|
||||||
"iconColor": "rgba(0, 211, 255, 1)",
|
|
||||||
"name": "Annotations & Alerts",
|
|
||||||
"target": {
|
|
||||||
"limit": 100,
|
|
||||||
"matchAny": false,
|
|
||||||
"tags": [ ],
|
|
||||||
"type": "dashboard"
|
|
||||||
},
|
|
||||||
"type": "dashboard"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"enable": false,
|
|
||||||
"expr": "sum by (tenant) (changes(loki_ruler_wal_prometheus_tsdb_wal_truncations_total{tenant=~\"${tenant}\"}[$__rate_interval]))",
|
|
||||||
"iconColor": "red",
|
|
||||||
"name": "WAL Truncations",
|
|
||||||
"target": {
|
|
||||||
"queryType": "Azure Monitor",
|
|
||||||
"refId": "Anno"
|
|
||||||
},
|
|
||||||
"titleFormat": "{{tenant}}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"editable": true,
|
|
||||||
"fiscalYearStartMonth": 0,
|
|
||||||
"gnetId": null,
|
|
||||||
"graphTooltip": 0,
|
|
||||||
"iteration": 1635347545534,
|
|
||||||
"links": [ ],
|
|
||||||
"liveNow": false,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"noValue": "0",
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 2,
|
|
||||||
"x": 0,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"id": 2,
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "auto",
|
|
||||||
"orientation": "auto",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "auto"
|
|
||||||
},
|
|
||||||
"pluginVersion": "8.3.0-38205pre",
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": false,
|
|
||||||
"expr": "sum(loki_ruler_wal_appender_ready) by (pod, tenant) == 0",
|
|
||||||
"instant": true,
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Appenders Not Ready",
|
|
||||||
"type": "stat"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"description": "",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
},
|
|
||||||
"custom": {
|
|
||||||
"axisLabel": "",
|
|
||||||
"axisPlacement": "auto",
|
|
||||||
"barAlignment": 0,
|
|
||||||
"drawStyle": "line",
|
|
||||||
"fillOpacity": 0,
|
|
||||||
"gradientMode": "none",
|
|
||||||
"hideFrom": {
|
|
||||||
"legend": false,
|
|
||||||
"tooltip": false,
|
|
||||||
"viz": false
|
|
||||||
},
|
|
||||||
"lineInterpolation": "linear",
|
|
||||||
"lineWidth": 1,
|
|
||||||
"pointSize": 5,
|
|
||||||
"scaleDistribution": {
|
|
||||||
"type": "linear"
|
|
||||||
},
|
|
||||||
"showPoints": "auto",
|
|
||||||
"spanNulls": false,
|
|
||||||
"stacking": {
|
|
||||||
"group": "A",
|
|
||||||
"mode": "none"
|
|
||||||
},
|
|
||||||
"thresholdsStyle": {
|
|
||||||
"mode": "off"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 80
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 11,
|
|
||||||
"x": 2,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"id": 4,
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"calcs": [ ],
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "bottom"
|
|
||||||
},
|
|
||||||
"tooltip": {
|
|
||||||
"mode": "single"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": true,
|
|
||||||
"expr": "sum(rate(loki_ruler_wal_samples_appended_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "{{tenant}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Samples Appended to WAL per Second",
|
|
||||||
"type": "timeseries"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"description": "Series are unique combinations of labels",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
},
|
|
||||||
"custom": {
|
|
||||||
"axisLabel": "",
|
|
||||||
"axisPlacement": "auto",
|
|
||||||
"barAlignment": 0,
|
|
||||||
"drawStyle": "line",
|
|
||||||
"fillOpacity": 0,
|
|
||||||
"gradientMode": "none",
|
|
||||||
"hideFrom": {
|
|
||||||
"legend": false,
|
|
||||||
"tooltip": false,
|
|
||||||
"viz": false
|
|
||||||
},
|
|
||||||
"lineInterpolation": "linear",
|
|
||||||
"lineWidth": 1,
|
|
||||||
"pointSize": 5,
|
|
||||||
"scaleDistribution": {
|
|
||||||
"type": "linear"
|
|
||||||
},
|
|
||||||
"showPoints": "auto",
|
|
||||||
"spanNulls": false,
|
|
||||||
"stacking": {
|
|
||||||
"group": "A",
|
|
||||||
"mode": "none"
|
|
||||||
},
|
|
||||||
"thresholdsStyle": {
|
|
||||||
"mode": "off"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 80
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 11,
|
|
||||||
"x": 13,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"id": 5,
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"calcs": [ ],
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "bottom"
|
|
||||||
},
|
|
||||||
"tooltip": {
|
|
||||||
"mode": "single"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": true,
|
|
||||||
"expr": "sum(rate(loki_ruler_wal_storage_created_series_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "{{tenant}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Series Created per Second",
|
|
||||||
"type": "timeseries"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"description": "Difference between highest timestamp appended to WAL and highest timestamp successfully written to remote storage",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
},
|
|
||||||
"custom": {
|
|
||||||
"axisLabel": "",
|
|
||||||
"axisPlacement": "auto",
|
|
||||||
"barAlignment": 0,
|
|
||||||
"drawStyle": "line",
|
|
||||||
"fillOpacity": 0,
|
|
||||||
"gradientMode": "none",
|
|
||||||
"hideFrom": {
|
|
||||||
"legend": false,
|
|
||||||
"tooltip": false,
|
|
||||||
"viz": false
|
|
||||||
},
|
|
||||||
"lineInterpolation": "linear",
|
|
||||||
"lineWidth": 1,
|
|
||||||
"pointSize": 5,
|
|
||||||
"scaleDistribution": {
|
|
||||||
"type": "linear"
|
|
||||||
},
|
|
||||||
"showPoints": "auto",
|
|
||||||
"spanNulls": false,
|
|
||||||
"stacking": {
|
|
||||||
"group": "A",
|
|
||||||
"mode": "none"
|
|
||||||
},
|
|
||||||
"thresholdsStyle": {
|
|
||||||
"mode": "off"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 80
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 12,
|
|
||||||
"x": 0,
|
|
||||||
"y": 10
|
|
||||||
},
|
|
||||||
"id": 6,
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"calcs": [ ],
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "bottom"
|
|
||||||
},
|
|
||||||
"tooltip": {
|
|
||||||
"mode": "single"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": true,
|
|
||||||
"expr": "loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds{tenant=~\"${tenant}\"}\n- on (tenant)\n (\n loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds{tenant=~\"${tenant}\"}\n or vector(0)\n )",
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "{{tenant}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Write Behind",
|
|
||||||
"type": "timeseries"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"description": "",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
},
|
|
||||||
"custom": {
|
|
||||||
"axisLabel": "",
|
|
||||||
"axisPlacement": "auto",
|
|
||||||
"barAlignment": 0,
|
|
||||||
"drawStyle": "line",
|
|
||||||
"fillOpacity": 0,
|
|
||||||
"gradientMode": "none",
|
|
||||||
"hideFrom": {
|
|
||||||
"legend": false,
|
|
||||||
"tooltip": false,
|
|
||||||
"viz": false
|
|
||||||
},
|
|
||||||
"lineInterpolation": "linear",
|
|
||||||
"lineWidth": 1,
|
|
||||||
"pointSize": 5,
|
|
||||||
"scaleDistribution": {
|
|
||||||
"type": "linear"
|
|
||||||
},
|
|
||||||
"showPoints": "auto",
|
|
||||||
"spanNulls": false,
|
|
||||||
"stacking": {
|
|
||||||
"group": "A",
|
|
||||||
"mode": "none"
|
|
||||||
},
|
|
||||||
"thresholdsStyle": {
|
|
||||||
"mode": "off"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 80
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 12,
|
|
||||||
"x": 12,
|
|
||||||
"y": 10
|
|
||||||
},
|
|
||||||
"id": 7,
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"calcs": [ ],
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "bottom"
|
|
||||||
},
|
|
||||||
"tooltip": {
|
|
||||||
"mode": "single"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": true,
|
|
||||||
"expr": "sum(rate(loki_ruler_wal_prometheus_remote_storage_samples_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "{{tenant}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Samples Sent per Second",
|
|
||||||
"type": "timeseries"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"description": "\n",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
},
|
|
||||||
"custom": {
|
|
||||||
"axisLabel": "",
|
|
||||||
"axisPlacement": "auto",
|
|
||||||
"barAlignment": 0,
|
|
||||||
"drawStyle": "line",
|
|
||||||
"fillOpacity": 0,
|
|
||||||
"gradientMode": "none",
|
|
||||||
"hideFrom": {
|
|
||||||
"legend": false,
|
|
||||||
"tooltip": false,
|
|
||||||
"viz": false
|
|
||||||
},
|
|
||||||
"lineInterpolation": "linear",
|
|
||||||
"lineWidth": 1,
|
|
||||||
"pointSize": 5,
|
|
||||||
"scaleDistribution": {
|
|
||||||
"type": "linear"
|
|
||||||
},
|
|
||||||
"showPoints": "auto",
|
|
||||||
"spanNulls": false,
|
|
||||||
"stacking": {
|
|
||||||
"group": "A",
|
|
||||||
"mode": "none"
|
|
||||||
},
|
|
||||||
"thresholdsStyle": {
|
|
||||||
"mode": "off"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 80
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "bytes"
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 12,
|
|
||||||
"x": 0,
|
|
||||||
"y": 20
|
|
||||||
},
|
|
||||||
"id": 8,
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"calcs": [ ],
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "bottom"
|
|
||||||
},
|
|
||||||
"tooltip": {
|
|
||||||
"mode": "single"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": true,
|
|
||||||
"expr": "sum by (tenant) (loki_ruler_wal_disk_size{tenant=~\"${tenant}\"})",
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "{{tenant}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "WAL Disk Size",
|
|
||||||
"type": "timeseries"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"description": "Some number of pending samples is expected, but if remote-write is failing this value will remain high",
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
},
|
|
||||||
"custom": {
|
|
||||||
"axisLabel": "",
|
|
||||||
"axisPlacement": "auto",
|
|
||||||
"barAlignment": 0,
|
|
||||||
"drawStyle": "line",
|
|
||||||
"fillOpacity": 0,
|
|
||||||
"gradientMode": "none",
|
|
||||||
"hideFrom": {
|
|
||||||
"legend": false,
|
|
||||||
"tooltip": false,
|
|
||||||
"viz": false
|
|
||||||
},
|
|
||||||
"lineInterpolation": "linear",
|
|
||||||
"lineWidth": 1,
|
|
||||||
"pointSize": 5,
|
|
||||||
"scaleDistribution": {
|
|
||||||
"type": "linear"
|
|
||||||
},
|
|
||||||
"showPoints": "auto",
|
|
||||||
"spanNulls": false,
|
|
||||||
"stacking": {
|
|
||||||
"group": "A",
|
|
||||||
"mode": "none"
|
|
||||||
},
|
|
||||||
"thresholdsStyle": {
|
|
||||||
"mode": "off"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mappings": [ ],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "red",
|
|
||||||
"value": 80
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [ ]
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 10,
|
|
||||||
"w": 12,
|
|
||||||
"x": 12,
|
|
||||||
"y": 20
|
|
||||||
},
|
|
||||||
"id": 9,
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"calcs": [ ],
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "bottom"
|
|
||||||
},
|
|
||||||
"tooltip": {
|
|
||||||
"mode": "single"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"exemplar": true,
|
|
||||||
"expr": "max(loki_ruler_wal_prometheus_remote_storage_samples_pending{tenant=~\"${tenant}\"}) by (tenant,pod) > 0",
|
|
||||||
"interval": "",
|
|
||||||
"legendFormat": "{{tenant}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Pending Samples",
|
|
||||||
"type": "timeseries"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 31,
|
|
||||||
"style": "dark",
|
|
||||||
"tags": [ ],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"description": null,
|
|
||||||
"error": null,
|
|
||||||
"hide": 0,
|
|
||||||
"includeAll": false,
|
|
||||||
"label": "Datasource",
|
|
||||||
"multi": false,
|
|
||||||
"name": "datasource",
|
|
||||||
"options": [ ],
|
|
||||||
"query": "prometheus",
|
|
||||||
"queryValue": "",
|
|
||||||
"refresh": 1,
|
|
||||||
"regex": "",
|
|
||||||
"skipUrlSync": false,
|
|
||||||
"type": "datasource"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"allValue": null,
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"definition": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
|
|
||||||
"description": null,
|
|
||||||
"error": null,
|
|
||||||
"hide": 0,
|
|
||||||
"includeAll": true,
|
|
||||||
"label": "Tenant",
|
|
||||||
"multi": true,
|
|
||||||
"name": "tenant",
|
|
||||||
"options": [ ],
|
|
||||||
"query": {
|
|
||||||
"query": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
|
|
||||||
"refId": "StandardVariableQuery"
|
|
||||||
},
|
|
||||||
"refresh": 2,
|
|
||||||
"regex": "",
|
|
||||||
"skipUrlSync": false,
|
|
||||||
"sort": 0,
|
|
||||||
"type": "query"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"time": {
|
|
||||||
"from": "now-6h",
|
|
||||||
"to": "now"
|
|
||||||
},
|
|
||||||
"timepicker": { },
|
|
||||||
"timezone": "",
|
|
||||||
"title": "Recording Rules",
|
|
||||||
"uid": "2xKA_ZK7k",
|
|
||||||
"version": 9,
|
|
||||||
"weekStart": ""
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
52
charts/meta-monitoring/src/rules/loki-rules.yaml
Normal file
52
charts/meta-monitoring/src/rules/loki-rules.yaml
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
- name: "loki_rules"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:loki_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:loki_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[5m]))
|
||||||
|
by (cluster, job)"
|
||||||
|
record: "cluster_job:loki_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
|
||||||
|
record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:loki_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:loki_request_duration_seconds_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, route))"
|
||||||
|
record: "cluster_job_route:loki_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, route))"
|
||||||
|
record: "cluster_job_route:loki_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)
|
||||||
|
/ sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
|
record: "cluster_job_route:loki_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, job,
|
||||||
|
route)"
|
||||||
|
record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, job, route)"
|
||||||
|
record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
|
record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
|
job, route) / sum(rate(loki_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:loki_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(loki_request_duration_seconds_count[5m])) by (cluster, namespace,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate"
|
555
charts/meta-monitoring/src/rules/mimir-rules.yaml
Normal file
555
charts/meta-monitoring/src/rules/mimir-rules.yaml
Normal file
@@ -0,0 +1,555 @@
|
|||||||
|
groups:
|
||||||
|
- name: "mimir_api_1"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[5m]))
|
||||||
|
by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
|
||||||
|
record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_api_2"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, route))"
|
||||||
|
record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, route))"
|
||||||
|
record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)
|
||||||
|
/ sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
|
record: "cluster_job_route:cortex_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job,
|
||||||
|
route)"
|
||||||
|
record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)"
|
||||||
|
record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
|
||||||
|
record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_api_3"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
|
job, route) / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, namespace,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_querier_api"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_querier_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, route))"
|
||||||
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, route))"
|
||||||
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by
|
||||||
|
(cluster, job, route)"
|
||||||
|
record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job, route)"
|
||||||
|
record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m]))
|
||||||
|
by (cluster, namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_cache"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, method))"
|
||||||
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, method))"
|
||||||
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[5m]))
|
||||||
|
by (cluster, job, method)"
|
||||||
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
job, method)"
|
||||||
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job, method)"
|
||||||
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job, method)"
|
||||||
|
record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)
|
||||||
|
/ sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_cache_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, method))"
|
||||||
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job, method))"
|
||||||
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
|
||||||
|
method) / sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job, method)"
|
||||||
|
record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
job, method)"
|
||||||
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
|
||||||
|
method)"
|
||||||
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job, method)"
|
||||||
|
record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_storage"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)
|
||||||
|
/ sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_kv_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_queries"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_query_frontend_retries:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_query_frontend_retries:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[5m]))
|
||||||
|
by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_retries:avg"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_retries_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by
|
||||||
|
(cluster, job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le,
|
||||||
|
cluster, job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by (cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
|
||||||
|
- name: "mimir_ingester_queries"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_series:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_series:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[5m]))
|
||||||
|
by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_series:avg"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_series_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_samples:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_samples:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[5m]))
|
||||||
|
by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_samples:avg"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_samples_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
|
||||||
|
by (le, cluster, job))"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job) /
|
||||||
|
sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_exemplars:avg"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster,
|
||||||
|
job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
|
||||||
|
record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
|
||||||
|
- name: "mimir_received_samples"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
|
||||||
|
record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
|
||||||
|
- name: "mimir_exemplars_in"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
|
||||||
|
record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
|
||||||
|
- name: "mimir_received_exemplars"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
|
||||||
|
record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
|
||||||
|
- name: "mimir_exemplars_ingested"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
|
||||||
|
record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
|
||||||
|
- name: "mimir_exemplars_appended"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
|
||||||
|
record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
|
||||||
|
- name: "mimir_scaling_rules"
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
kube_deployment_spec_replicas,
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
|
||||||
|
)
|
||||||
|
record: "cluster_namespace_deployment:actual_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
quantile_over_time(0.99,
|
||||||
|
sum by (cluster, namespace) (
|
||||||
|
cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
||||||
|
)[24h:]
|
||||||
|
)
|
||||||
|
/ 240000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "distributor"
|
||||||
|
reason: "sample_rate"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||||
|
* 0.59999999999999998 / 240000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "distributor"
|
||||||
|
reason: "sample_rate_limits"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
quantile_over_time(0.99,
|
||||||
|
sum by (cluster, namespace) (
|
||||||
|
cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
||||||
|
)[24h:]
|
||||||
|
)
|
||||||
|
* 3 / 80000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "ingester"
|
||||||
|
reason: "sample_rate"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
quantile_over_time(0.99,
|
||||||
|
sum by(cluster, namespace) (
|
||||||
|
cortex_ingester_memory_series
|
||||||
|
)[24h:]
|
||||||
|
)
|
||||||
|
/ 1500000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "ingester"
|
||||||
|
reason: "active_series"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
|
||||||
|
* 3 * 0.59999999999999998 / 1500000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "ingester"
|
||||||
|
reason: "active_series_limits"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||||
|
* 0.59999999999999998 / 80000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "ingester"
|
||||||
|
reason: "sample_rate_limits"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
(sum by (cluster, namespace) (
|
||||||
|
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
|
||||||
|
) / 4)
|
||||||
|
/
|
||||||
|
avg by (cluster, namespace) (
|
||||||
|
memcached_limit_bytes{job=~".+/memcached"}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: "memcached"
|
||||||
|
reason: "active_series"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[5m])),
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||||
|
# that remove resource metrics, ref:
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
|
||||||
|
#
|
||||||
|
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
|
||||||
|
# the release in which kube_pod_container_resource_requests_cpu_cores was removed:
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests_cpu_cores,
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
# This expression is compatible with kube-state-metrics >= v1.4.0,
|
||||||
|
# where kube_pod_container_resource_requests was introduced.
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests{resource="cpu"},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
|
||||||
|
- expr: |
|
||||||
|
# Jobs should be sized to their CPU usage.
|
||||||
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
|
# their currently provisioned number of replicas and resource requests.
|
||||||
|
ceil(
|
||||||
|
cluster_namespace_deployment:actual_replicas:count
|
||||||
|
*
|
||||||
|
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
|
||||||
|
/
|
||||||
|
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
reason: "cpu_usage"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the memory usage (in bytes) for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
container_memory_usage_bytes{image!=""},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||||
|
# that remove resource metrics, ref:
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
|
||||||
|
#
|
||||||
|
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
|
||||||
|
# the release in which kube_pod_container_resource_requests_memory_bytes was removed:
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests_memory_bytes,
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
# This expression is compatible with kube-state-metrics >= v1.4.0,
|
||||||
|
# where kube_pod_container_resource_requests was introduced.
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests{resource="memory"},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
|
||||||
|
- expr: |
|
||||||
|
# Jobs should be sized to their Memory usage.
|
||||||
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
|
# their currently provisioned number of replicas and resource requests.
|
||||||
|
ceil(
|
||||||
|
cluster_namespace_deployment:actual_replicas:count
|
||||||
|
*
|
||||||
|
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
|
||||||
|
/
|
||||||
|
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
reason: "memory_usage"
|
||||||
|
record: "cluster_namespace_deployment_reason:required_replicas:count"
|
||||||
|
- name: "mimir_alertmanager_rules"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
|
||||||
|
record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
|
||||||
|
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_silences)"
|
||||||
|
record: "cluster_job_pod:cortex_alertmanager_silences:sum"
|
||||||
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
|
||||||
|
record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
|
||||||
|
record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
|
||||||
|
record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
|
||||||
|
record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
|
||||||
|
record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
|
||||||
|
record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
|
||||||
|
record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
|
||||||
|
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
|
||||||
|
record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
|
||||||
|
- name: "mimir_ingester_rules"
|
||||||
|
rules:
|
||||||
|
- expr: "sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[5m]))"
|
||||||
|
record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"
|
15
charts/meta-monitoring/src/rules/tempo-rules.yaml
Normal file
15
charts/meta-monitoring/src/rules/tempo-rules.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
groups:
|
||||||
|
- name: "tempo_rules"
|
||||||
|
rules:
|
||||||
|
- expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
|
||||||
|
- expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
|
||||||
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
|
||||||
|
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
|
||||||
|
- expr: "sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
|
||||||
|
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
|
||||||
|
- expr: "sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
|
||||||
|
record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"
|
33
charts/meta-monitoring/templates/_helpers.tpl
Normal file
33
charts/meta-monitoring/templates/_helpers.tpl
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
{{/*
|
||||||
|
Return the appropriate apiVersion for ingress.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.apiVersion" -}}
|
||||||
|
{{- if and (.Capabilities.APIVersions.Has "networking.k8s.io/v1") (semverCompare ">= 1.19-0" .Capabilities.KubeVersion.Version) -}}
|
||||||
|
{{- print "networking.k8s.io/v1" -}}
|
||||||
|
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" -}}
|
||||||
|
{{- print "networking.k8s.io/v1beta1" -}}
|
||||||
|
{{- else -}}
|
||||||
|
{{- print "extensions/v1beta1" -}}
|
||||||
|
{{- end -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Return if ingress is stable.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.isStable" -}}
|
||||||
|
{{- eq (include "ingress.apiVersion" .) "networking.k8s.io/v1" -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Return if ingress supports ingressClassName.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.supportsIngressClassName" -}}
|
||||||
|
{{- or (eq (include "ingress.isStable" .) "true") (and (eq (include "ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) -}}
|
||||||
|
{{- end -}}
|
||||||
|
|
||||||
|
{{/*
|
||||||
|
Return if ingress supports pathType.
|
||||||
|
*/}}
|
||||||
|
{{- define "ingress.supportsPathType" -}}
|
||||||
|
{{- or (eq (include "ingress.isStable" .) "true") (and (eq (include "ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18-0" .Capabilities.KubeVersion.Version)) -}}
|
||||||
|
{{- end -}}
|
@@ -6,6 +6,15 @@
|
|||||||
{{- join ", " $list }}
|
{{- join ", " $list }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
{{- define "agent.all_namespaces" -}}
|
||||||
|
{{- $list := list }}
|
||||||
|
{{- range .Values.namespacesToMonitor }}
|
||||||
|
{{- $list = append $list (printf "%s" .) }}
|
||||||
|
{{- end }}
|
||||||
|
{{- $list = append $list .Release.Namespace }}
|
||||||
|
{{- join "|" $list }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
{{- define "agent.loki_write_targets" -}}
|
{{- define "agent.loki_write_targets" -}}
|
||||||
{{- $list := list }}
|
{{- $list := list }}
|
||||||
{{- if .Values.local.logs.enabled }}
|
{{- if .Values.local.logs.enabled }}
|
||||||
@@ -17,6 +26,14 @@
|
|||||||
{{- join ", " $list }}
|
{{- join ", " $list }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
{{- define "agent.loki_process_targets" -}}
|
||||||
|
{{- if and (empty .Values.logs.piiRegexes) (empty .Values.logs.retain) }}
|
||||||
|
{{- include "agent.loki_write_targets" . }}
|
||||||
|
{{- else }}
|
||||||
|
{{- printf "loki.process.filter.receiver" }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
{{- define "agent.prometheus_write_targets" -}}
|
{{- define "agent.prometheus_write_targets" -}}
|
||||||
{{- $list := list }}
|
{{- $list := list }}
|
||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.local.metrics.enabled }}
|
||||||
@@ -31,10 +48,32 @@
|
|||||||
{{- define "agent.tempo_write_targets" -}}
|
{{- define "agent.tempo_write_targets" -}}
|
||||||
{{- $list := list }}
|
{{- $list := list }}
|
||||||
{{- if .Values.local.traces.enabled }}
|
{{- if .Values.local.traces.enabled }}
|
||||||
{{- $list = append $list ("otelcol.exporter.otlp.local.input") }}
|
{{- $list = append $list ("otelcol.exporter.otlphttp.local.input") }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.cloud.traces.enabled }}
|
{{- if .Values.cloud.traces.enabled }}
|
||||||
{{- $list = append $list ("otelcol.exporter.otlp.cloud.input") }}
|
{{- $list = append $list ("otelcol.exporter.otlphttp.cloud.input") }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- join ", " $list }}
|
{{- join ", " $list }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- define "agent.all_logs" -}}
|
||||||
|
{{- $list := list }}
|
||||||
|
{{- range .Values.logs.retain }}
|
||||||
|
{{- $list = append $list . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- range .Values.logs.extraLogs }}
|
||||||
|
{{- $list = append $list . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- join "|" $list }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- define "agent.all_metrics" -}}
|
||||||
|
{{- $list := list }}
|
||||||
|
{{- range .Values.metrics.retain }}
|
||||||
|
{{- $list = append $list . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- range .Values.metrics.extraMetrics }}
|
||||||
|
{{- $list = append $list . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- join "|" $list }}
|
||||||
{{- end }}
|
{{- end }}
|
@@ -8,7 +8,7 @@ data:
|
|||||||
discovery.kubernetes "pods" {
|
discovery.kubernetes "pods" {
|
||||||
role = "pod"
|
role = "pod"
|
||||||
namespaces {
|
namespaces {
|
||||||
own_namespace = false
|
own_namespace = true
|
||||||
names = [ {{ include "agent.namespaces" . }} ]
|
names = [ {{ include "agent.namespaces" . }} ]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -33,38 +33,283 @@ data:
|
|||||||
}
|
}
|
||||||
rule {
|
rule {
|
||||||
target_label = "cluster"
|
target_label = "cluster"
|
||||||
replacement = "{{- .Values.clusterName -}}"
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
||||||
loki.source.kubernetes "pods" {
|
// Logs
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
|
||||||
forward_to = [ {{ include "agent.loki_write_targets" . }} ]
|
{{- if .Values.cloud.logs.enabled }}
|
||||||
|
remote.kubernetes.secret "logs_credentials" {
|
||||||
|
namespace = "{{- $.Release.Namespace -}}"
|
||||||
|
name = "{{- .Values.cloud.logs.secret -}}"
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
loki.source.kubernetes "pods" {
|
||||||
prometheus.scrape "pods" {
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
targets = discovery.relabel.rename_meta_labels.output
|
||||||
|
forward_to = [ {{ include "agent.loki_process_targets" . }} ]
|
||||||
|
}
|
||||||
|
|
||||||
|
{{- if or (not (empty .Values.logs.retain)) (not (empty .Values.logs.piiRegexes)) }}
|
||||||
|
loki.process "filter" {
|
||||||
|
forward_to = [ {{ include "agent.loki_write_targets" . }} ]
|
||||||
|
|
||||||
|
{{- if or (not (empty .Values.logs.retain)) (not (empty .Values.logs.extraLogs)) }}
|
||||||
|
stage.match {
|
||||||
|
selector = "{cluster=\"{{- .Values.clusterLabelValue -}}\", namespace=~\"{{- join "|" .Values.namespacesToMonitor -}}|{{- $.Release.Namespace -}}\", pod=~\"loki.*\"} !~ \"{{ include "agent.all_logs" . }}\""
|
||||||
|
action = "drop"
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if not (empty .Values.logs.piiRegexes) }}
|
||||||
|
{{- range .Values.logs.piiRegexes }}
|
||||||
|
stage.replace {
|
||||||
|
expression = "{{ .expression }}"
|
||||||
|
source = "{{ .source }}"
|
||||||
|
replace = "{{ .replace }}"
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
||||||
|
// Metrics
|
||||||
|
|
||||||
|
{{- if .Values.cloud.metrics.enabled }}
|
||||||
|
remote.kubernetes.secret "metrics_credentials" {
|
||||||
|
namespace = "{{- $.Release.Namespace -}}"
|
||||||
|
name = "{{- .Values.cloud.metrics.secret -}}"
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
discovery.kubernetes "metric_pods" {
|
||||||
|
role = "pod"
|
||||||
|
namespaces {
|
||||||
|
own_namespace = true
|
||||||
|
names = [ {{ include "agent.namespaces" . }} ]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.relabel "only_http_metrics" {
|
||||||
|
targets = discovery.kubernetes.metric_pods.targets
|
||||||
|
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_container_port_number"]
|
||||||
|
action = "drop"
|
||||||
|
regex = "9095"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "pods" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = discovery.relabel.only_http_metrics.output
|
||||||
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.relabel "filter" {
|
||||||
|
rule {
|
||||||
|
source_labels = ["__name__"]
|
||||||
|
regex = "({{ include "agent.all_metrics" . }})"
|
||||||
|
action = "keep"
|
||||||
|
}
|
||||||
|
|
||||||
|
rule {
|
||||||
|
source_labels = ["namespace"]
|
||||||
|
regex = "{{ include "agent.all_namespaces" . }}"
|
||||||
|
|
||||||
|
action = "keep"
|
||||||
|
}
|
||||||
|
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
}
|
}
|
||||||
|
{{- if .Values.kubeStateMetrics.enabled }}
|
||||||
|
|
||||||
|
prometheus.scrape "kubeStateMetrics" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
|
||||||
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
// cAdvisor and Kubelet metrics
|
||||||
|
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
|
||||||
|
discovery.kubernetes "all_nodes" {
|
||||||
|
role = "node"
|
||||||
|
namespaces {
|
||||||
|
own_namespace = true
|
||||||
|
names = [ {{ include "agent.namespaces" . }} ]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.relabel "all_nodes" {
|
||||||
|
targets = discovery.kubernetes.all_nodes.targets
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_node_name"]
|
||||||
|
target_label = "node"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "cadvisor" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = discovery.relabel.all_nodes.output
|
||||||
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
|
||||||
|
metrics_path = "/metrics/cadvisor"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||||
|
tls_config {
|
||||||
|
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "kubelet" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = discovery.relabel.all_nodes.output
|
||||||
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
|
||||||
|
metrics_path = "/metrics"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||||
|
tls_config {
|
||||||
|
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.exporter.unix "promexporter" {}
|
||||||
|
|
||||||
|
prometheus.scrape "node_exporter" {
|
||||||
|
clustering {
|
||||||
|
enabled = true
|
||||||
|
}
|
||||||
|
targets = prometheus.exporter.unix.promexporter.targets
|
||||||
|
forward_to = [prometheus.relabel.node_exporter.receiver]
|
||||||
|
|
||||||
|
job_name = "node-exporter"
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.relabel "node_exporter" {
|
||||||
|
forward_to = [ prometheus.relabel.filter.receiver ]
|
||||||
|
|
||||||
|
rule {
|
||||||
|
replacement = env("HOSTNAME")
|
||||||
|
target_label = "nodename"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
replacement = "node-exporter"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_node_name"]
|
||||||
|
target_label = "node"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterLabelValue -}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
||||||
|
// Traces
|
||||||
|
|
||||||
|
{{- if .Values.cloud.traces.enabled }}
|
||||||
|
remote.kubernetes.secret "traces_credentials" {
|
||||||
|
namespace = "{{- $.Release.Namespace -}}"
|
||||||
|
name = "{{- .Values.cloud.traces.secret -}}"
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
||||||
otelcol.receiver.otlp "otlp_receiver" {
|
otelcol.receiver.otlp "otlp_receiver" {
|
||||||
// We don't technically need this, but it shows how to change listen address and incoming port.
|
// We don't technically need this, but it shows how to change listen address and incoming port.
|
||||||
// In this case, the Agent is listening on all available bindable addresses on port 4317 (which is the
|
// In this case, the Agent is listening on all available bindable addresses on port 4317 (which is the
|
||||||
// default OTLP gRPC port) for the OTLP protocol.
|
// default OTLP gRPC port) for the OTLP protocol.
|
||||||
grpc {
|
grpc {}
|
||||||
endpoint = "0.0.0.0:4317"
|
|
||||||
}
|
|
||||||
|
|
||||||
// We define where to send the output of all ingested traces. In this case, to the OpenTelemetry batch processor
|
// We define where to send the output of all ingested traces. In this case, to the OpenTelemetry batch processor
|
||||||
// named 'default'.
|
// named 'default'.
|
||||||
output {
|
output {
|
||||||
traces = [otelcol.processor.batch.default.input]
|
traces = [otelcol.processor.batch.default.input]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
otelcol.receiver.jaeger "jaeger" {
|
||||||
|
protocols {
|
||||||
|
thrift_http {}
|
||||||
|
}
|
||||||
|
|
||||||
|
output {
|
||||||
|
traces = [otelcol.processor.batch.default.input]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -85,7 +330,7 @@ data:
|
|||||||
{{- if .Values.local.logs.enabled }}
|
{{- if .Values.local.logs.enabled }}
|
||||||
loki.write "local" {
|
loki.write "local" {
|
||||||
endpoint {
|
endpoint {
|
||||||
url = "http://loki-gateway.{{- .Release.Namespace -}}.svc.cluster.local:80/loki/api/v1/push"
|
url = "http://{{- .Release.Namespace -}}-loki-gateway.{{- .Release.Namespace -}}.svc.cluster.local:80/loki/api/v1/push"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
@@ -98,21 +343,10 @@ data:
|
|||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
{{- if .Values.local.traces.enabled }}
|
||||||
// The OpenTelemetry exporter exports processed trace spans to another target that is listening for OTLP format traces.
|
otelcol.exporter.otlphttp "local" {
|
||||||
// A unique label, 'local', is added to uniquely identify this exporter.
|
|
||||||
otelcol.exporter.otlp "local" {
|
|
||||||
// Define the client for exporting.
|
|
||||||
client {
|
client {
|
||||||
// Send to the locally running Tempo instance, on port 4317 (OTLP gRPC).
|
endpoint = "http://{{- .Release.Name -}}-tempo-distributor.svc:4318"
|
||||||
endpoint = "meta-tempo-distributor:4317"
|
|
||||||
// Configure TLS settings for communicating with the endpoint.
|
|
||||||
tls {
|
|
||||||
// The connection is insecure.
|
|
||||||
insecure = true
|
|
||||||
// Do not verify TLS certificates when connecting.
|
|
||||||
insecure_skip_verify = true
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
@@ -120,11 +354,10 @@ data:
|
|||||||
{{- if .Values.cloud.logs.enabled }}
|
{{- if .Values.cloud.logs.enabled }}
|
||||||
loki.write "cloud" {
|
loki.write "cloud" {
|
||||||
endpoint {
|
endpoint {
|
||||||
url = "{{- .Values.cloud.logs.endpoint -}}/loki/api/v1/push"
|
url = nonsensitive(remote.kubernetes.secret.logs_credentials.data["endpoint"])
|
||||||
|
|
||||||
basic_auth {
|
basic_auth {
|
||||||
username = "{{- .Values.cloud.logs.username -}}"
|
username = nonsensitive(remote.kubernetes.secret.logs_credentials.data["username"])
|
||||||
password = "{{- .Values.cloud.logs.password -}}"
|
password = remote.kubernetes.secret.logs_credentials.data["password"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,26 +366,25 @@ data:
|
|||||||
{{- if .Values.cloud.metrics.enabled }}
|
{{- if .Values.cloud.metrics.enabled }}
|
||||||
prometheus.remote_write "cloud" {
|
prometheus.remote_write "cloud" {
|
||||||
endpoint {
|
endpoint {
|
||||||
url = "{{- .Values.cloud.metrics.endpoint -}}/api/prom/push"
|
url = nonsensitive(remote.kubernetes.secret.metrics_credentials.data["endpoint"])
|
||||||
|
|
||||||
basic_auth {
|
basic_auth {
|
||||||
username = "{{- .Values.cloud.metrics.username -}}"
|
username = nonsensitive(remote.kubernetes.secret.metrics_credentials.data["username"])
|
||||||
password = "{{- .Values.cloud.metrics.password -}}"
|
password = remote.kubernetes.secret.metrics_credentials.data["password"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{- if .Values.cloud.traces.enabled }}
|
{{- if .Values.cloud.traces.enabled }}
|
||||||
otelcol.exporter.otlp "cloud" {
|
otelcol.exporter.otlphttp "cloud" {
|
||||||
client {
|
client {
|
||||||
endpoint = "{{- .Values.cloud.traces.endpoint -}}"
|
endpoint = nonsensitive(remote.kubernetes.secret.traces_credentials.data["endpoint"])
|
||||||
auth = otelcol.auth.basic.creds.handler
|
auth = otelcol.auth.basic.creds.handler
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
otelcol.auth.basic "creds" {
|
otelcol.auth.basic "creds" {
|
||||||
username = "{{- .Values.cloud.traces.username -}}"
|
username = nonsensitive(remote.kubernetes.secret.traces_credentials.data["username"])
|
||||||
password = "{{- .Values.cloud.traces.password -}}"
|
password = remote.kubernetes.secret.traces_credentials.data["password"]
|
||||||
}
|
}
|
||||||
{{- end }}
|
{{- end }}
|
@@ -0,0 +1,19 @@
|
|||||||
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: agent-dashboards-1
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
data:
|
||||||
|
"agent-logs-pipeline.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/agent-logs-pipeline.json" | fromJson | toJson }}
|
||||||
|
"agent-operational.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/agent-operational.json" | fromJson | toJson }}
|
||||||
|
"agent-remote-write.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }}
|
||||||
|
"agent-tracing-pipeline.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }}
|
||||||
|
"agent.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }}
|
||||||
|
{{- end }}
|
@@ -1,15 +1,16 @@
|
|||||||
{{- if .Values.local.logs.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
metadata:
|
metadata:
|
||||||
name: loki-dashboards-provisioning
|
name: dashboards-provisioning
|
||||||
namespace: {{ $.Release.Namespace }}
|
namespace: {{ $.Release.Namespace }}
|
||||||
data:
|
data:
|
||||||
dashboards.yaml: |
|
dashboards.yaml: |
|
||||||
---
|
---
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
providers:
|
providers:
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- disableDeletion: true
|
- disableDeletion: true
|
||||||
editable: false
|
editable: false
|
||||||
folder: Loki
|
folder: Loki
|
||||||
@@ -26,4 +27,13 @@ data:
|
|||||||
path: /var/lib/grafana/dashboards/loki-2
|
path: /var/lib/grafana/dashboards/loki-2
|
||||||
orgId: 1
|
orgId: 1
|
||||||
type: file
|
type: file
|
||||||
|
{{- end }}
|
||||||
|
- disableDeletion: true
|
||||||
|
editable: false
|
||||||
|
folder: Agent
|
||||||
|
name: agent-1
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards/agent-1
|
||||||
|
orgId: 1
|
||||||
|
type: file
|
||||||
{{- end }}
|
{{- end }}
|
@@ -1,9 +1,9 @@
|
|||||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
{{- if .Values.local.grafana.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
metadata:
|
metadata:
|
||||||
name: loki-datasources-provisioning
|
name: datasources-provisioning
|
||||||
namespace: {{ $.Release.Namespace }}
|
namespace: {{ $.Release.Namespace }}
|
||||||
data:
|
data:
|
||||||
datasources.yaml: |
|
datasources.yaml: |
|
||||||
@@ -12,7 +12,7 @@ data:
|
|||||||
|
|
||||||
# List of data sources to delete from the database.
|
# List of data sources to delete from the database.
|
||||||
deleteDatasources:
|
deleteDatasources:
|
||||||
- name: Loki
|
- name: Loki
|
||||||
orgId: 1
|
orgId: 1
|
||||||
|
|
||||||
# List of data sources to insert/update depending on what's
|
# List of data sources to insert/update depending on what's
|
||||||
@@ -32,7 +32,7 @@ data:
|
|||||||
uid: loki_ds
|
uid: loki_ds
|
||||||
# <string> Sets the data source's URL, including the
|
# <string> Sets the data source's URL, including the
|
||||||
# port.
|
# port.
|
||||||
url: http://loki-gateway.{{- $.Release.Namespace -}}.svc.cluster.local
|
url: http://{{- $.Release.Namespace -}}-loki-gateway.{{- $.Release.Namespace -}}.svc.cluster.local
|
||||||
# <bool> Toggles whether the data source is pre-selected
|
# <bool> Toggles whether the data source is pre-selected
|
||||||
# for new panels. You can set only one default
|
# for new panels. You can set only one default
|
||||||
# data source per organization.
|
# data source per organization.
|
||||||
@@ -61,6 +61,10 @@ data:
|
|||||||
# <bool> Allows users to edit data sources from the
|
# <bool> Allows users to edit data sources from the
|
||||||
# Grafana UI.
|
# Grafana UI.
|
||||||
editable: true
|
editable: true
|
||||||
|
# Extra config.
|
||||||
|
jsonData:
|
||||||
|
# Scrape interval
|
||||||
|
timeInterval: 1m
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.local.traces.enabled }}
|
{{- if .Values.local.traces.enabled }}
|
||||||
- name: Tempo
|
- name: Tempo
|
||||||
|
@@ -1,16 +1,4 @@
|
|||||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
{{- if .Values.local.grafana.enabled }}
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: grafana-pvc
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 1Gi
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
metadata:
|
metadata:
|
||||||
@@ -32,7 +20,7 @@ spec:
|
|||||||
- 0
|
- 0
|
||||||
containers:
|
containers:
|
||||||
- name: grafana
|
- name: grafana
|
||||||
image: grafana/grafana:10.0.0
|
image: grafana/grafana:{{- .Values.grafana.version }}
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 3000
|
- containerPort: 3000
|
||||||
@@ -64,26 +52,30 @@ spec:
|
|||||||
- mountPath: /var/lib/grafana
|
- mountPath: /var/lib/grafana
|
||||||
name: grafana-pv
|
name: grafana-pv
|
||||||
- mountPath: /etc/grafana/provisioning/datasources
|
- mountPath: /etc/grafana/provisioning/datasources
|
||||||
name: loki-datasources-provisioning
|
name: datasources-provisioning
|
||||||
{{- if .Values.local.logs.enabled }}
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- mountPath: /etc/grafana/provisioning/dashboards
|
- mountPath: /etc/grafana/provisioning/dashboards
|
||||||
name: loki-dashboards-provisioning
|
name: dashboards-provisioning
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- mountPath: /var/lib/grafana/dashboards/loki-1
|
- mountPath: /var/lib/grafana/dashboards/loki-1
|
||||||
name: loki-dashboards-1
|
name: loki-dashboards-1
|
||||||
- mountPath: /var/lib/grafana/dashboards/loki-2
|
- mountPath: /var/lib/grafana/dashboards/loki-2
|
||||||
name: loki-dashboards-2
|
name: loki-dashboards-2
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
- mountPath: /var/lib/grafana/dashboards/agent-1
|
||||||
|
name: agent-dashboards-1
|
||||||
volumes:
|
volumes:
|
||||||
- name: grafana-pv
|
- name: grafana-pv
|
||||||
persistentVolumeClaim:
|
persistentVolumeClaim:
|
||||||
claimName: grafana-pvc
|
claimName: grafana-pvc
|
||||||
- name: loki-datasources-provisioning
|
- name: datasources-provisioning
|
||||||
configMap:
|
configMap:
|
||||||
name: loki-datasources-provisioning
|
name: datasources-provisioning
|
||||||
{{- if .Values.local.logs.enabled }}
|
- name: dashboards-provisioning
|
||||||
- name: loki-dashboards-provisioning
|
|
||||||
configMap:
|
configMap:
|
||||||
name: loki-dashboards-provisioning
|
name: dashboards-provisioning
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- name: loki-dashboards-1
|
- name: loki-dashboards-1
|
||||||
configMap:
|
configMap:
|
||||||
name: loki-dashboards-1
|
name: loki-dashboards-1
|
||||||
@@ -91,19 +83,7 @@ spec:
|
|||||||
configMap:
|
configMap:
|
||||||
name: loki-dashboards-2
|
name: loki-dashboards-2
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
- name: agent-dashboards-1
|
||||||
---
|
configMap:
|
||||||
apiVersion: v1
|
name: agent-dashboards-1
|
||||||
kind: Service
|
{{- end }}
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
spec:
|
|
||||||
ports:
|
|
||||||
- port: 3000
|
|
||||||
protocol: TCP
|
|
||||||
targetPort: http-grafana
|
|
||||||
selector:
|
|
||||||
app: grafana
|
|
||||||
sessionAffinity: None
|
|
||||||
type: ClusterIP # Make this configurable
|
|
||||||
{{- end }}
|
|
@@ -0,0 +1,57 @@
|
|||||||
|
{{- if and .Values.local.grafana.enabled .Values.grafana.ingress.enabled -}}
|
||||||
|
{{- $ingressApiIsStable := eq (include "ingress.isStable" .) "true" -}}
|
||||||
|
{{- $ingressSupportsIngressClassName := eq (include "ingress.supportsIngressClassName" .) "true" -}}
|
||||||
|
{{- $ingressSupportsPathType := eq (include "ingress.supportsPathType" .) "true" -}}
|
||||||
|
apiVersion: {{ include "ingress.apiVersion" . }}
|
||||||
|
kind: Ingress
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
labels:
|
||||||
|
app: grafana
|
||||||
|
{{- range $labelKey, $labelValue := .Values.grafana.ingress.labels }}
|
||||||
|
{{ $labelKey }}: {{ $labelValue | toYaml }}
|
||||||
|
{{- end }}
|
||||||
|
{{- with .Values.grafana.ingress.annotations }}
|
||||||
|
annotations:
|
||||||
|
{{- toYaml . | nindent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
spec:
|
||||||
|
{{- if and $ingressSupportsIngressClassName .Values.grafana.ingress.ingressClassName }}
|
||||||
|
ingressClassName: {{ .Values.grafana.ingress.ingressClassName }}
|
||||||
|
{{- end -}}
|
||||||
|
{{- if .Values.grafana.ingress.tls }}
|
||||||
|
tls:
|
||||||
|
{{- range .Values.grafana.ingress.tls }}
|
||||||
|
- hosts:
|
||||||
|
{{- range .hosts }}
|
||||||
|
- {{ tpl . $ | quote }}
|
||||||
|
{{- end }}
|
||||||
|
{{- with .secretName }}
|
||||||
|
secretName: {{ . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
rules:
|
||||||
|
{{- range .Values.grafana.ingress.hosts }}
|
||||||
|
- host: {{ tpl .host $ | quote }}
|
||||||
|
http:
|
||||||
|
paths:
|
||||||
|
{{- range .paths }}
|
||||||
|
- path: {{ .path }}
|
||||||
|
{{- if $ingressSupportsPathType }}
|
||||||
|
pathType: {{ .pathType }}
|
||||||
|
{{- end }}
|
||||||
|
backend:
|
||||||
|
{{- if $ingressApiIsStable }}
|
||||||
|
service:
|
||||||
|
name: grafana
|
||||||
|
port:
|
||||||
|
number: 3000
|
||||||
|
{{- else }}
|
||||||
|
serviceName: grafana
|
||||||
|
servicePort: 3000
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
12
charts/meta-monitoring/templates/grafana/grafana-pvc.yaml
Normal file
12
charts/meta-monitoring/templates/grafana/grafana-pvc.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{{- if .Values.local.grafana.enabled }}
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: grafana-pvc
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
{{- end }}
|
@@ -0,0 +1,15 @@
|
|||||||
|
{{- if .Values.local.grafana.enabled }}
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- port: 3000
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: http-grafana
|
||||||
|
selector:
|
||||||
|
app: grafana
|
||||||
|
sessionAffinity: None
|
||||||
|
type: ClusterIP # Make this configurable
|
||||||
|
{{- end }}
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.logs.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.logs.enabled }}
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
129
charts/meta-monitoring/templates/ruler/ruler.yaml
Normal file
129
charts/meta-monitoring/templates/ruler/ruler.yaml
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
{{- if .Values.local.grafana.enabled }}
|
||||||
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: {{ $.Release.Namespace }}-mimir-ruler-for-dashboards
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
spec:
|
||||||
|
progressDeadlineSeconds: 600
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 10
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
strategy:
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 50%
|
||||||
|
maxUnavailable: 0
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- args:
|
||||||
|
- -target=ruler
|
||||||
|
- -log.level=debug
|
||||||
|
- -ruler-storage.backend=local
|
||||||
|
- -ruler-storage.local.directory=/etc/rules
|
||||||
|
- -ruler.ring.prefix=dashboards/
|
||||||
|
- -config.expand-env=true
|
||||||
|
- -config.file=/etc/mimir/mimir.yaml
|
||||||
|
image: grafana/mimir:2.8.0
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
name: ruler
|
||||||
|
ports:
|
||||||
|
- containerPort: 8080
|
||||||
|
name: http-metrics
|
||||||
|
protocol: TCP
|
||||||
|
- containerPort: 9095
|
||||||
|
name: grpc
|
||||||
|
protocol: TCP
|
||||||
|
- containerPort: 7946
|
||||||
|
name: memberlist
|
||||||
|
protocol: TCP
|
||||||
|
envFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: minio
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 3
|
||||||
|
httpGet:
|
||||||
|
path: /ready
|
||||||
|
port: http-metrics
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 45
|
||||||
|
periodSeconds: 10
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 1
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 128Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
terminationMessagePath: /dev/termination-log
|
||||||
|
terminationMessagePolicy: File
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /etc/mimir
|
||||||
|
name: config
|
||||||
|
- mountPath: /var/mimir
|
||||||
|
name: runtime-config
|
||||||
|
- mountPath: /data
|
||||||
|
name: storage
|
||||||
|
- mountPath: /active-query-tracker
|
||||||
|
name: active-queries
|
||||||
|
- mountPath: /etc/rules/anonymous
|
||||||
|
name: rules
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
restartPolicy: Always
|
||||||
|
schedulerName: default-scheduler
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 10001
|
||||||
|
runAsGroup: 10001
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 10001
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
terminationGracePeriodSeconds: 180
|
||||||
|
topologySpreadConstraints:
|
||||||
|
- labelSelector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: ruler
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
maxSkew: 1
|
||||||
|
topologyKey: kubernetes.io/hostname
|
||||||
|
whenUnsatisfiable: ScheduleAnyway
|
||||||
|
volumes:
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
items:
|
||||||
|
- key: mimir.yaml
|
||||||
|
path: mimir.yaml
|
||||||
|
name: {{ $.Release.Namespace }}-mimir-config
|
||||||
|
name: config
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: {{ $.Release.Namespace }}-mimir-runtime
|
||||||
|
name: runtime-config
|
||||||
|
- emptyDir: {}
|
||||||
|
name: storage
|
||||||
|
- emptyDir: {}
|
||||||
|
name: active-queries
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: rules
|
||||||
|
name: rules
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
14
charts/meta-monitoring/templates/ruler/rules-configmap.yaml
Normal file
14
charts/meta-monitoring/templates/ruler/rules-configmap.yaml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{{- if .Values.local.metrics.enabled }}
|
||||||
|
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: rules
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
data:
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/loki-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
@@ -3,20 +3,20 @@
|
|||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- if eq .Values.cloud.logs.enabled true -}}
|
{{- if eq .Values.cloud.logs.enabled true -}}
|
||||||
{{- if or (empty .Values.cloud.logs.endpoint) (or (empty .Values.cloud.logs.username) (empty .Values.cloud.logs.password)) -}}
|
{{- if empty .Values.cloud.logs.secret -}}
|
||||||
{{- fail "if cloud.logs is enabled then the endpoint, username and password have to be filled in" -}}
|
{{- fail "if cloud.logs is enabled then the secret has to be filled in" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- if eq .Values.cloud.metrics.enabled true -}}
|
{{- if eq .Values.cloud.metrics.enabled true -}}
|
||||||
{{- if or (empty .Values.cloud.metrics.endpoint) (or (empty .Values.cloud.metrics.username) (empty .Values.cloud.metrics.password)) -}}
|
{{- if empty .Values.cloud.metrics.secret -}}
|
||||||
{{- fail "if cloud.metrics is enabled then the endpoint, username and password have to be filled in" -}}
|
{{- fail "if cloud.metrics is enabled then the secret has to be filled in" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
{{- if eq .Values.cloud.traces.enabled true -}}
|
{{- if eq .Values.cloud.traces.enabled true -}}
|
||||||
{{- if or (empty .Values.cloud.traces.endpoint) (or (empty .Values.cloud.traces.username) (empty .Values.cloud.traces.password)) -}}
|
{{- if empty .Values.cloud.traces.secret -}}
|
||||||
{{- fail "if cloud.traces is enabled then the endpoint, username and password have to be filled in" -}}
|
{{- fail "if cloud.traces is enabled then the secret has to be filled in" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
@@ -37,3 +37,7 @@
|
|||||||
{{- if empty .Values.namespacesToMonitor -}}
|
{{- if empty .Values.namespacesToMonitor -}}
|
||||||
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
|
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
|
|
||||||
|
{{- if empty .Values.metrics.retain -}}
|
||||||
|
{{- fail "All metrics will be collected, please specify some in metrics.retain" -}}
|
||||||
|
{{- end -}}
|
||||||
|
@@ -1,58 +1,226 @@
|
|||||||
|
# Specify the namespaces to monitor here
|
||||||
namespacesToMonitor:
|
namespacesToMonitor:
|
||||||
- loki
|
- loki
|
||||||
- mimir
|
# The name of the cluster where this will be installed
|
||||||
- tempo
|
clusterLabelValue: "meta-monitoring"
|
||||||
clusterName: "meta-monitoring" # TODO check if this can be derived
|
# Set to true to write logs, metrics or traces to Grafana Cloud
|
||||||
|
# The secrets have to be created first
|
||||||
local:
|
|
||||||
logs:
|
|
||||||
enabled: false
|
|
||||||
metrics:
|
|
||||||
enabled: false
|
|
||||||
traces:
|
|
||||||
enabled: false
|
|
||||||
minio:
|
|
||||||
enabled: false # This should be set to true if any of the previous is enabled
|
|
||||||
|
|
||||||
|
|
||||||
cloud:
|
cloud:
|
||||||
logs:
|
logs:
|
||||||
enabled: true
|
enabled: true
|
||||||
endpoint:
|
secret: "logs"
|
||||||
username:
|
|
||||||
password:
|
|
||||||
metrics:
|
metrics:
|
||||||
enabled: true
|
enabled: true
|
||||||
endpoint:
|
secret: "metrics"
|
||||||
username:
|
|
||||||
password:
|
|
||||||
traces:
|
traces:
|
||||||
enabled: true
|
enabled: true
|
||||||
endpoint:
|
secret: "traces"
|
||||||
username:
|
# Set to true for a local version of logs, metrics or traces
|
||||||
password:
|
local:
|
||||||
|
grafana:
|
||||||
global:
|
enabled: false
|
||||||
|
logs:
|
||||||
|
enabled: false
|
||||||
|
metrics:
|
||||||
|
enabled: false
|
||||||
|
traces:
|
||||||
|
enabled: false
|
||||||
minio:
|
minio:
|
||||||
rootUser: "rootuser"
|
enabled: false # This should be set to true if any of the previous is enabled
|
||||||
rootPassword: "rootpassword"
|
grafana:
|
||||||
|
version: 10.4.2
|
||||||
|
# Grafana ingress configuration
|
||||||
|
ingress:
|
||||||
|
# -- Specifies whether an ingress for Grafana should be created
|
||||||
|
enabled: true
|
||||||
|
# -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18
|
||||||
|
ingressClassName: ""
|
||||||
|
# -- Annotations for the gateway ingress
|
||||||
|
annotations: {}
|
||||||
|
# -- Labels for the gateway ingress
|
||||||
|
labels: {}
|
||||||
|
# -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating
|
||||||
|
hosts:
|
||||||
|
- host: monitoring.example.com
|
||||||
|
paths:
|
||||||
|
- path: /
|
||||||
|
# -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers
|
||||||
|
# pathType: Prefix
|
||||||
|
# -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating
|
||||||
|
#tls:
|
||||||
|
# - secretName: grafana-tls
|
||||||
|
# hosts:
|
||||||
|
# - monitoring.example.com
|
||||||
|
logs:
|
||||||
|
# Adding regexes here will add a stage.replace block for logs. For more information see
|
||||||
|
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
|
||||||
|
piiRegexes: null # This example replaces the word after password with *****
|
||||||
|
# - expression: "password (\\\\S+)"
|
||||||
|
# source: "" # Empty uses the log message
|
||||||
|
# replace: "*****"
|
||||||
|
# The lines matching these will be kept in Loki
|
||||||
|
retain:
|
||||||
|
# This shows the queries
|
||||||
|
- caller=metrics.go
|
||||||
|
# This shows any errors
|
||||||
|
- level=error
|
||||||
|
# Log lines for delete requests
|
||||||
|
- delete request for user added
|
||||||
|
- Started processing delete request
|
||||||
|
- delete request for user marked as processed
|
||||||
|
# This shows the ingest requests and is very noisy. Uncomment to include.
|
||||||
|
# - caller=push.go
|
||||||
|
# Additional log lines to retain
|
||||||
|
extraLogs: []
|
||||||
|
metrics:
|
||||||
|
# The list of metrics to retain for logging dashboards
|
||||||
|
retain:
|
||||||
|
- agent_config_last_load_success_timestamp_seconds
|
||||||
|
- agent_config_last_load_successful
|
||||||
|
- agent_config_load_failures_total
|
||||||
|
- container_cpu_usage_seconds_total
|
||||||
|
- container_fs_writes_bytes_total
|
||||||
|
- container_memory_working_set_bytes
|
||||||
|
- container_network_receive_bytes_total
|
||||||
|
- container_network_transmit_bytes_total
|
||||||
|
- container_spec_cpu_period
|
||||||
|
- container_spec_cpu_quota
|
||||||
|
- container_spec_memory_limit_bytes
|
||||||
|
- cortex_ingester_flush_queue_length
|
||||||
|
- cortex_prometheus_rule_group_iterations_total
|
||||||
|
- cortex_prometheus_rule_evaluation_failures_total
|
||||||
|
- cortex_prometheus_rule_group_rules
|
||||||
|
- cortex_prometheus_rule_group_last_duration_seconds
|
||||||
|
- cortex_prometheus_rule_group_last_evaluation_timestamp_seconds
|
||||||
|
- cortex_prometheus_rule_group_iterations_missed_total
|
||||||
|
- go_gc_duration_seconds
|
||||||
|
- go_goroutines
|
||||||
|
- go_memstats_heap_inuse_bytes
|
||||||
|
- kubelet_volume_stats_used_bytes
|
||||||
|
- kubelet_volume_stats_capacity_bytes
|
||||||
|
- kube_deployment_created
|
||||||
|
- kube_persistentvolumeclaim_labels
|
||||||
|
- kube_pod_container_info
|
||||||
|
- kube_pod_container_resource_requests
|
||||||
|
- kube_pod_container_status_last_terminated_reason
|
||||||
|
- kube_pod_container_status_restarts_total
|
||||||
|
- loki_boltdb_shipper_compact_tables_operation_duration_seconds
|
||||||
|
- loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds
|
||||||
|
- loki_boltdb_shipper_retention_marker_count_total
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_count
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_sum
|
||||||
|
- loki_boltdb_shipper_retention_marker_table_processed_total
|
||||||
|
- loki_boltdb_shipper_request_duration_seconds_bucket
|
||||||
|
- loki_boltdb_shipper_request_duration_seconds_count
|
||||||
|
- loki_boltdb_shipper_request_duration_seconds_sum
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_bucket
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_sum
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_marker_files_current
|
||||||
|
- loki_boltdb_shipper_retention_sweeper_marker_file_processing_current_time
|
||||||
|
- loki_build_info
|
||||||
|
- loki_chunk_store_deduped_chunks_total
|
||||||
|
- loki_chunk_store_index_entries_per_chunk_bucket
|
||||||
|
- loki_chunk_store_index_entries_per_chunk_count
|
||||||
|
- loki_chunk_store_index_entries_per_chunk_sum
|
||||||
|
- loki_compactor_delete_requests_processed_total
|
||||||
|
- loki_compactor_delete_requests_received_total
|
||||||
|
- loki_compactor_deleted_lines
|
||||||
|
- loki_compactor_oldest_pending_delete_request_age_seconds
|
||||||
|
- loki_compactor_pending_delete_requests_count
|
||||||
|
- loki_discarded_samples_total
|
||||||
|
- loki_distributor_bytes_received_total
|
||||||
|
- loki_distributor_lines_received_total
|
||||||
|
- loki_distributor_structured_metadata_bytes_received_total
|
||||||
|
- loki_index_request_duration_seconds_count
|
||||||
|
- loki_ingester_chunk_age_seconds_bucket
|
||||||
|
- loki_ingester_chunk_age_seconds_count
|
||||||
|
- loki_ingester_chunk_age_seconds_sum
|
||||||
|
- loki_ingester_chunk_bounds_hours_bucket
|
||||||
|
- loki_ingester_chunk_bounds_hours_count
|
||||||
|
- loki_ingester_chunk_bounds_hours_sum
|
||||||
|
- loki_ingester_chunk_entries_bucket
|
||||||
|
- loki_ingester_chunk_entries_count
|
||||||
|
- loki_ingester_chunk_entries_sum
|
||||||
|
- loki_ingester_chunk_size_bytes_bucket
|
||||||
|
- loki_ingester_chunk_utilization_bucket
|
||||||
|
- loki_ingester_chunk_utilization_sum
|
||||||
|
- loki_ingester_chunks_flushed_total
|
||||||
|
- loki_ingester_flush_queue_length
|
||||||
|
- loki_ingester_memory_chunks
|
||||||
|
- loki_ingester_memory_streams
|
||||||
|
- loki_ingester_streams_created_total
|
||||||
|
- loki_request_duration_seconds_bucket
|
||||||
|
- loki_request_duration_seconds_count
|
||||||
|
- loki_request_duration_seconds_sum
|
||||||
|
- loki_ruler_wal_appender_ready
|
||||||
|
- loki_ruler_wal_disk_size
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_samples_pending
|
||||||
|
- loki_ruler_wal_prometheus_remote_storage_samples_total
|
||||||
|
- loki_ruler_wal_samples_appended_total
|
||||||
|
- loki_ruler_wal_storage_created_series_total
|
||||||
|
- loki_write_batch_retries_total
|
||||||
|
- loki_write_dropped_bytes_total
|
||||||
|
- loki_write_dropped_entries_total
|
||||||
|
- loki_write_sent_bytes_total
|
||||||
|
- loki_write_sent_entries_total
|
||||||
|
- node_disk_read_bytes_total
|
||||||
|
- node_disk_written_bytes_total
|
||||||
|
- promtail_custom_bad_words_total
|
||||||
|
# Additional metrics to retain
|
||||||
|
extraMetrics: []
|
||||||
|
# Set enabled = true to add the default logs dashboards to the local Grafana
|
||||||
|
dashboards:
|
||||||
|
logs:
|
||||||
|
enabled: true
|
||||||
|
kubeStateMetrics:
|
||||||
|
# Scrape https://github.com/kubernetes/kube-state-metrics by default
|
||||||
|
enabled: true
|
||||||
|
# This endpoint is created when the helm chart from
|
||||||
|
# https://artifacthub.io/packages/helm/prometheus-community/kube-state-metrics/
|
||||||
|
# is used. Change this if kube-state-metrics is installed somewhere else.
|
||||||
|
endpoint: kube-state-metrics.kube-state-metrics.svc.cluster.local:8080
|
||||||
# The following are configuration for the dependencies.
|
# The following are configuration for the dependencies.
|
||||||
# These should not be changed.
|
# These should usually not be changed.
|
||||||
|
|
||||||
loki:
|
loki:
|
||||||
loki:
|
loki:
|
||||||
auth_enabled: false
|
auth_enabled: false
|
||||||
|
schemaConfig:
|
||||||
|
configs:
|
||||||
|
- from: 2024-03-29
|
||||||
|
store: tsdb
|
||||||
|
object_store: s3
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: index_
|
||||||
|
period: 24h
|
||||||
storage:
|
storage:
|
||||||
type: "s3"
|
type: "s3"
|
||||||
s3:
|
s3:
|
||||||
endpoint: "meta-minio.meta.svc:9000"
|
|
||||||
access_key_id: rootuser
|
|
||||||
secret_access_key: rootpassword
|
|
||||||
insecure: true
|
insecure: true
|
||||||
|
s3ForcePathStyle: true
|
||||||
bucketNames:
|
bucketNames:
|
||||||
chunks: loki-chunks
|
chunks: loki-chunks
|
||||||
ruler: loki-ruler
|
ruler: loki-ruler
|
||||||
|
structuredConfig:
|
||||||
|
common:
|
||||||
|
storage:
|
||||||
|
s3:
|
||||||
|
access_key_id: "${rootUser}"
|
||||||
|
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
||||||
|
secret_access_key: "${rootPassword}"
|
||||||
|
compactor:
|
||||||
|
retention_enabled: true
|
||||||
|
delete_request_store: s3
|
||||||
|
limits_config:
|
||||||
|
retention_period: 30d
|
||||||
|
lokiCanary:
|
||||||
|
enabled: false
|
||||||
|
test:
|
||||||
|
enabled: false
|
||||||
monitoring:
|
monitoring:
|
||||||
dashboards:
|
dashboards:
|
||||||
enabled: false
|
enabled: false
|
||||||
@@ -66,44 +234,85 @@ loki:
|
|||||||
installOperator: false
|
installOperator: false
|
||||||
lokiCanary:
|
lokiCanary:
|
||||||
enabled: false
|
enabled: false
|
||||||
test:
|
write:
|
||||||
enabled: false
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
grafana-agent:
|
extraEnvFrom:
|
||||||
agent:
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
read:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
backend:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
alloy:
|
||||||
|
alloy:
|
||||||
|
clustering:
|
||||||
|
enabled: true
|
||||||
configMap:
|
configMap:
|
||||||
create: false
|
create: false
|
||||||
name: "agent-configmap"
|
name: "agent-configmap"
|
||||||
key: 'config.river'
|
key: 'config.river'
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: '1000m'
|
||||||
|
memory: '600Mi'
|
||||||
|
limits:
|
||||||
|
memory: '4Gi'
|
||||||
|
extraPorts:
|
||||||
|
- name: "otel"
|
||||||
|
port: 4317
|
||||||
|
targetPort: 4317
|
||||||
|
protocol: "TCP"
|
||||||
|
- name: "thrifthttp"
|
||||||
|
port: 14268
|
||||||
|
targetPort: 14268
|
||||||
|
protocol: "TCP"
|
||||||
|
controller:
|
||||||
|
type: "statefulset"
|
||||||
|
autoscaling:
|
||||||
|
enabled: true
|
||||||
|
minReplicas: 3
|
||||||
|
maxReplicas: 30
|
||||||
|
targetMemoryUtilizationPercentage: 90
|
||||||
|
targetCPUUtilizationPercentage: 90
|
||||||
mimir-distributed:
|
mimir-distributed:
|
||||||
minio:
|
minio:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
global:
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
mimir:
|
mimir:
|
||||||
structuredConfig:
|
structuredConfig:
|
||||||
alertmanager_storage:
|
alertmanager_storage:
|
||||||
s3:
|
s3:
|
||||||
bucket_name: mimir-ruler
|
bucket_name: mimir-ruler
|
||||||
access_key_id: "{{ .Values.global.minio.rootUser }}"
|
|
||||||
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
|
||||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
|
||||||
insecure: true
|
|
||||||
blocks_storage:
|
blocks_storage:
|
||||||
backend: s3
|
backend: s3
|
||||||
s3:
|
s3:
|
||||||
bucket_name: mimir-tsdb
|
bucket_name: mimir-tsdb
|
||||||
access_key_id: "{{ .Values.global.minio.rootUser }}"
|
|
||||||
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
|
||||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
|
||||||
insecure: true
|
|
||||||
ruler_storage:
|
ruler_storage:
|
||||||
s3:
|
s3:
|
||||||
bucket_name: mimir-ruler
|
bucket_name: mimir-ruler
|
||||||
access_key_id: "{{ .Values.global.minio.rootUser }}"
|
common:
|
||||||
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
storage:
|
||||||
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
|
backend: s3
|
||||||
insecure: true
|
s3:
|
||||||
|
bucket_name: mimir-ruler
|
||||||
|
access_key_id: "${rootUser}"
|
||||||
|
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
||||||
|
secret_access_key: "${rootPassword}"
|
||||||
|
insecure: true
|
||||||
|
limits:
|
||||||
|
compactor_blocks_retention_period: 30d
|
||||||
tempo-distributed:
|
tempo-distributed:
|
||||||
tempo:
|
tempo:
|
||||||
structuredConfig:
|
structuredConfig:
|
||||||
@@ -113,19 +322,47 @@ tempo-distributed:
|
|||||||
s3:
|
s3:
|
||||||
bucket: tempo
|
bucket: tempo
|
||||||
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
|
||||||
access_key: "{{ .Values.global.minio.rootUser }}"
|
access_key: "${rootUser}"
|
||||||
secret_key: "{{ .Values.global.minio.rootPassword }}"
|
secret_key: "${rootPassword}"
|
||||||
insecure: true
|
insecure: true
|
||||||
|
distributor:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
ingester:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
compactor:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
querier:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
|
queryFrontend:
|
||||||
|
extraArgs:
|
||||||
|
- "-config.expand-env=true"
|
||||||
|
extraEnvFrom:
|
||||||
|
- secretRef:
|
||||||
|
name: "minio"
|
||||||
traces:
|
traces:
|
||||||
otlp:
|
otlp:
|
||||||
http:
|
http:
|
||||||
enabled: true
|
enabled: true
|
||||||
grpc:
|
grpc:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
minio:
|
minio:
|
||||||
rootUser: rootuser
|
existingSecret: "minio"
|
||||||
rootPassword: rootpassword
|
|
||||||
buckets:
|
buckets:
|
||||||
- name: loki-chunks
|
- name: loki-chunks
|
||||||
policy: none
|
policy: none
|
||||||
@@ -150,4 +387,4 @@ minio:
|
|||||||
cpu: 100m
|
cpu: 100m
|
||||||
memory: 128Mi
|
memory: 128Mi
|
||||||
# Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this.
|
# Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this.
|
||||||
configPathmc: "/tmp/minio/mc/"
|
configPathmc: "/tmp/minio/mc/"
|
||||||
|
10
docs/create_new_release.md
Normal file
10
docs/create_new_release.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Create a new release
|
||||||
|
|
||||||
|
1. Update the version field in charts/meta-monitoring/Chart.yaml in a new PR. Merge this PR if approved.
|
||||||
|
|
||||||
|
2. On the [Actions tab](https://github.com/grafana/meta-monitoring-chart/actions):
|
||||||
|
- Select `Release Helm chart` in the workflows on the left
|
||||||
|
- Click the `Run workflow` button
|
||||||
|
- Leave the `main` branch as is
|
||||||
|
- Click the green `Run workflow` button
|
||||||
|
|
@@ -1,27 +1,193 @@
|
|||||||
# Install this chart
|
# Install this chart
|
||||||
|
|
||||||
|
## Preparation for Cloud mode (preferred)
|
||||||
|
|
||||||
|
1. Use an existing Grafana Cloud account or set up a new one. Then create an access token:
|
||||||
|
|
||||||
|
1. In Grafana go to Administration -> Users and Access -> Cloud access policies.
|
||||||
|
|
||||||
|
1. Click `Create access policy`.
|
||||||
|
|
||||||
|
1. Fill in the `Display name` field and check the `Write` check box for metrics, logs and traces. Then click `Create`.
|
||||||
|
|
||||||
|
1. On the newly created access policy click `Add token`.
|
||||||
|
|
||||||
|
1. Fill in the `Token name` field and click `Create`. Make a copy of the token as it will be used later on.
|
||||||
|
|
||||||
1. Create the meta namespace
|
1. Create the meta namespace
|
||||||
|
|
||||||
```
|
```
|
||||||
kubectl create namespace meta
|
kubectl create namespace meta
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml).
|
1. Create secrets with credentials and the endpoint when sending logs, metrics or traces to Grafana Cloud.
|
||||||
|
|
||||||
|
```
|
||||||
|
kubectl create secret generic logs -n meta \
|
||||||
|
--from-literal=username=<logs username> \
|
||||||
|
--from-literal=password=<token> \
|
||||||
|
--from-literal=endpoint='https://logs-prod-us-central1.grafana.net/loki/api/v1/push'
|
||||||
|
|
||||||
|
kubectl create secret generic metrics -n meta \
|
||||||
|
--from-literal=username=<metrics username> \
|
||||||
|
--from-literal=password=<token> \
|
||||||
|
--from-literal=endpoint='https://prometheus-us-central1.grafana.net/api/prom/push'
|
||||||
|
|
||||||
|
kubectl create secret generic traces -n meta \
|
||||||
|
--from-literal=username=<OTLP instance ID> \
|
||||||
|
--from-literal=password=<token> \
|
||||||
|
--from-literal=endpoint='https://otlp-gateway-prod-us-east-0.grafana.net/otlp'
|
||||||
|
```
|
||||||
|
|
||||||
|
The logs, metrics and traces usernames are the `User / Username / Instance IDs` of the Loki, Prometheus/Mimir and OpenTelemetry instances in Grafana Cloud. From `Home` in Grafana click on `Stacks`. Then go to the `Details` pages of Loki and Prometheus/Mimir. For OpenTelemetry go to the `Configure` page.
|
||||||
|
|
||||||
|
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). Fill in the names of the secrets created above as needed. An example minimal values.yaml looks like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
namespacesToMonitor:
|
||||||
|
- loki
|
||||||
|
|
||||||
|
cloud:
|
||||||
|
logs:
|
||||||
|
enabled: true
|
||||||
|
secret: "logs"
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
secret: "metrics"
|
||||||
|
traces:
|
||||||
|
enabled: true
|
||||||
|
secret: "traces"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Preparation for Local mode
|
||||||
|
|
||||||
|
1. Create the meta namespace
|
||||||
|
|
||||||
|
```
|
||||||
|
kubectl create namespace meta
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Create a secret named `minio` with the user and password for the local Minio:
|
||||||
|
|
||||||
|
```
|
||||||
|
kubectl create secret generic minio -n meta \
|
||||||
|
--from-literal=rootPassword=<password> \
|
||||||
|
--from-literal=rootUser=<user>
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). An example minimal values.yaml looks like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
namespacesToMonitor:
|
||||||
|
- loki
|
||||||
|
|
||||||
|
cloud:
|
||||||
|
logs:
|
||||||
|
enabled: false
|
||||||
|
metrics:
|
||||||
|
enabled: false
|
||||||
|
traces:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
local:
|
||||||
|
grafana:
|
||||||
|
enabled: true
|
||||||
|
logs:
|
||||||
|
enabled: true
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
traces:
|
||||||
|
enabled: true
|
||||||
|
minio:
|
||||||
|
enabled: true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installing the chart
|
||||||
|
|
||||||
|
1. Add the repo
|
||||||
|
|
||||||
|
```
|
||||||
|
helm repo add grafana https://grafana.github.io/helm-charts
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Fetch the latest charts from the grafana repo
|
||||||
|
|
||||||
|
```
|
||||||
|
helm repo update grafana
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
1. Install this helm chart
|
1. Install this helm chart
|
||||||
|
|
||||||
```
|
```
|
||||||
helm install -n meta -f values.yaml meta ./charts/meta-monitoring
|
helm install -n meta -f values.yaml meta grafana/meta-monitoring
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Upgrade
|
1. Upgrade
|
||||||
|
|
||||||
```
|
```
|
||||||
helm upgrade --install -f values.yaml -n meta meta ./charts/meta-monitoring
|
helm upgrade --install -f values.yaml -n meta meta grafana/meta-monitoring
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Delete this chart:
|
1. Delete this chart:
|
||||||
|
|
||||||
```
|
```
|
||||||
helm delete -n meta meta
|
helm delete -n meta meta
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Installing the dashboards and rules on Grafana Cloud
|
||||||
|
|
||||||
|
## Installing the dashboards on Grafana Cloud
|
||||||
|
|
||||||
|
Only the files for the application monitored have to be copied. When monitoring Loki import dashboard files starting with 'loki-'.
|
||||||
|
|
||||||
|
For each of the dashboard files in charts/meta-monitoring/src/dashboards folder do the following:
|
||||||
|
|
||||||
|
1. Click on 'Dashboards' in Grafana
|
||||||
|
|
||||||
|
1. Click on the 'New' button and select 'Import'
|
||||||
|
|
||||||
|
1. Drop the dashboard file to the 'Upload dashboard JSON file' drop area
|
||||||
|
|
||||||
|
1. Click 'Import'
|
||||||
|
|
||||||
|
## Installing the rules on Grafana Cloud
|
||||||
|
|
||||||
|
1. Select the rules files in charts/meta-monitoring/src/rules for the application to monitor. When monitoring Loki use loki-rules.yaml.
|
||||||
|
|
||||||
|
1. Install mimirtool as per the [instructions](https://grafana.com/docs/mimir/latest/manage/tools/mimirtool/)
|
||||||
|
|
||||||
|
1. Create an access policy with Read and Write permission for Rules. Also create a token and record the token.
|
||||||
|
|
||||||
|
1. Get your cloud Prometheus endpoint and Instance ID from the `Prometheus` page in `Stacks`.
|
||||||
|
|
||||||
|
1. Use them to load the rules using mimirtool as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
mimirtool rules load --address=<your_cloud_prometheus_endpoint> --id=<your_instance_id> --key=<your_cloud_access_policy_token> *.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
1. To check the rules you have uploaded run:
|
||||||
|
|
||||||
|
```
|
||||||
|
mimirtool rules print --address=<your_cloud_prometheus_endpoint> --id=<your_instance_id> --key=<your_cloud_access_policy_token>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configure Loki to send traces
|
||||||
|
|
||||||
|
1. In the Loki config enable tracing:
|
||||||
|
|
||||||
|
```
|
||||||
|
loki:
|
||||||
|
tracing:
|
||||||
|
enabled: true
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Add the following environment variables to your Loki binaries. When using the Loki Helm chart these can be added using the `extraEnv` setting for the Loki components.
|
||||||
|
|
||||||
|
1. JAEGER_ENDPOINT: http address of the mmc-alloy service installed by the meta-monitoring chart, for example "http://mmc-alloy:14268/api/traces"
|
||||||
|
1. JAEGER_AGENT_TAGS: extra tags you would like to add to the spans, for example 'cluster="abc",namespace="def"'
|
||||||
|
1. JAEGER_SAMPLER_TYPE: the sampling strategy, for example to sample all use 'const' with a value of 1 for the next environment variable
|
||||||
|
1. JAEGER_SAMPLER_PARAM: 1
|
||||||
|
|
||||||
|
1. If Loki is installed in a different namespace you can create an [ExternalName service](https://kubernetes.io/docs/concepts/services-networking/service/#externalname) in Kubernetes to point to the mmc-alloy service in the meta monitoring namespace
|
||||||
|
20
scripts/clone_loki_mixin.sh
Executable file
20
scripts/clone_loki_mixin.sh
Executable file
@@ -0,0 +1,20 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
clean_up() {
|
||||||
|
test -d "$tmp_dir" && rm -fr "$tmp_dir"
|
||||||
|
}
|
||||||
|
|
||||||
|
here=${PWD}
|
||||||
|
|
||||||
|
tmp_dir=$( mktemp -d -t my-script )
|
||||||
|
cd $tmp_dir
|
||||||
|
|
||||||
|
echo "Cloning Loki"
|
||||||
|
git clone --filter=blob:none --no-checkout "https://github.com/grafana/loki"
|
||||||
|
cd loki
|
||||||
|
git sparse-checkout init --cone
|
||||||
|
git checkout main
|
||||||
|
git sparse-checkout set production/loki-mixin
|
||||||
|
|
||||||
|
echo "Copying production/loki-mixin to ${here}"
|
||||||
|
cp -r production ${here}
|
18
scripts/mixin-meta-monitoring.libsonnet
Normal file
18
scripts/mixin-meta-monitoring.libsonnet
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
(import 'dashboards.libsonnet') +
|
||||||
|
(import 'alerts.libsonnet') +
|
||||||
|
(import 'recording_rules.libsonnet') + {
|
||||||
|
grafanaDashboardFolder: 'Loki Meta Monitoring',
|
||||||
|
|
||||||
|
_config+:: {
|
||||||
|
internal_components: false,
|
||||||
|
|
||||||
|
// The Meta Monitoring helm chart uses Grafana Alloy instead of promtail
|
||||||
|
promtail+: {
|
||||||
|
enabled: false,
|
||||||
|
},
|
||||||
|
|
||||||
|
meta_monitoring+: {
|
||||||
|
enabled: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
9
tools/kind.config
Normal file
9
tools/kind.config
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
kind: Cluster
|
||||||
|
apiVersion: kind.x-k8s.io/v1alpha4
|
||||||
|
name: meta
|
||||||
|
nodes:
|
||||||
|
- role: control-plane
|
||||||
|
- role: worker
|
||||||
|
- role: worker
|
||||||
|
- role: worker
|
||||||
|
|
Reference in New Issue
Block a user