Compare commits

..

No commits in common. "main" and "remove_non_loki_dashboards" have entirely different histories.

62 changed files with 6981 additions and 6604 deletions

View File

@ -1,19 +0,0 @@
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
- role: control-plane
kubeadmConfigPatches:
- |
kind: ClusterConfiguration
controllerManager:
extraArgs:
bind-address: 0.0.0.0
secure-port: "10257"
scheduler:
extraArgs:
bind-address: 0.0.0.0
secure-port: "10259"
- |
kind: KubeProxyConfiguration
metricsBindAddress: 0.0.0.0:10249
- role: worker

View File

@ -1,30 +0,0 @@
name: Bump grafana version specified in the values.yaml
sources:
latestGrafanaRelease:
name: Get latest grafana release on Github
kind: githubrelease
spec:
owner: grafana
repository: grafana
token: '{{ requiredEnv "UPDATECLI_GITHUB_TOKEN" }}'
versionfilter:
kind: latest
transformers:
- trimprefix: "v"
conditions:
grafanaImagePublished:
name: Ensure the latest Grafana is published on DockerHub
kind: dockerimage
source-id: latestGrafanaRelease
spec:
image: "grafana/grafana"
targets:
grafana:
name: Update Grafana version in values.yaml
kind: helmchart
spec:
file: values.yaml
key: $.grafana.version
name: charts/meta-monitoring
versionincrement: none
sourceid: latestGrafanaRelease

View File

@ -16,95 +16,9 @@ env:
UPDATECLI_GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
jobs:
updateVersions:
name: Update the subcharts
updateLoki:
name: Update the Loki subchart
runs-on: "ubuntu-latest"
permissions:
contents: write
id-token: write
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install Updatecli
uses: updatecli/updatecli-action@v2
- name: Run Updatecli for Loki
id: update-loki
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/loki.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Run Updatecli for Alloy
id: update-grafana-alloy
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/alloy.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Run Updatecli for Mimir
id: update-mimir-distributed
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/mimir-distributed.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Run Updatecli for Tempo
id: update-tempo-distributed
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/tempo-distributed.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Run Updatecli for Minio
id: update-minio
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/minio.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- id: get-secrets
uses: grafana/shared-workflows/actions/get-vault-secrets@main
with:
# Secrets placed in the ci/repo/grafana/<repo>/<path> path in Vault
repo_secrets: |
APP_ID=github-app:app-id
PRIVATE_KEY=github-app:private-key
- uses: actions/create-github-app-token@v1
id: app-token
with:
app-id: ${{ env.APP_ID }}
private-key: ${{ env.PRIVATE_KEY }}
- name: Create pull request
if: steps.update-loki.outputs.changed == 'true' || steps.update-grafana-alloy.outputs.changed == 'true' || steps.update-mimir-distributed.outputs.changed == 'true' || steps.update-tempo-distributed.outputs.changed == 'true' || steps.update-minio.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
title: "[dependency] Update the subcharts"
body: "Updates the subcharts"
base: main
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
committer: "GitHub <noreply@github.com>"
commit-message: Update dependencies
labels: dependencies
branch: chore/update-dependencies
delete-branch: true
team-reviewers: "@grafana/loki-squad"
token: ${{ steps.app-token.outputs.token }}
updateGrafana:
name: Update the Grafana version
runs-on: "ubuntu-latest"
permissions:
contents: write
id-token: write
steps:
- name: Checkout
uses: actions/checkout@v2
@ -113,39 +27,151 @@ jobs:
uses: updatecli/updatecli-action@v2
- name: Run Updatecli
id: update-grafana
id: update-loki
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/grafana.yaml
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/loki.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- id: get-secrets
uses: grafana/shared-workflows/actions/get-vault-secrets@main
with:
# Secrets placed in the ci/repo/grafana/<repo>/<path> path in Vault
repo_secrets: |
APP_ID=github-app:app-id
PRIVATE_KEY=github-app:private-key
- uses: actions/create-github-app-token@v1
id: app-token
with:
app-id: ${{ env.APP_ID }}
private-key: ${{ env.PRIVATE_KEY }}
- name: Create pull request
if: steps.update-grafana.outputs.changed == 'true'
if: steps.update-loki.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
title: "[dependency] Update the Grafana version"
body: "Updates the Grafana version"
title: "[dependency] Update the Loki subchart"
body: "Updates the Loki subchart"
base: main
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
committer: "GitHub <noreply@github.com>"
commit-message: Update Grafana version
commit-message: Update loki
labels: dependencies
branch: chore/update-loki
delete-branch: true
updateGrafanaAlloy:
name: Update the Grafana Alloy subchart
runs-on: "ubuntu-latest"
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install Updatecli
uses: updatecli/updatecli-action@v2
- name: Run Updatecli
id: update-grafana-alloy
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/alloy.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Create pull request
if: steps.update-grafana-alloy.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
title: "[dependency] Update the Grafana Alloy subchart"
body: "Updates the Grafana Alloy subchart"
base: main
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
committer: "GitHub <noreply@github.com>"
commit-message: Update Grafana Alloy
labels: dependencies
branch: chore/update-grafana-alloy
delete-branch: true
updateMimirDistributed:
name: Update the Mimir Distributed subchart
runs-on: "ubuntu-latest"
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install Updatecli
uses: updatecli/updatecli-action@v2
- name: Run Updatecli
id: update-mimir-distributed
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/mimir-distributed.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Create pull request
if: steps.update-mimir-distributed.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
title: "[dependency] Update the Mimir Distributed subchart"
body: "Updates the Mimir Distributed subchart"
base: main
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
committer: "GitHub <noreply@github.com>"
commit-message: Update Mimir Distributed
labels: dependencies
branch: chore/update-mimir-distributed
delete-branch: true
updateTempoDistributed:
name: Update the Tempo Distributed subchart
runs-on: "ubuntu-latest"
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install Updatecli
uses: updatecli/updatecli-action@v2
- name: Run Updatecli
id: update-tempo-distributed
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/tempo-distributed.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Create pull request
if: steps.update-tempo-distributed.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
title: "[dependency] Update the Tempo Distributed subchart"
body: "Updates the tempo Distributed subchart"
base: main
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
committer: "GitHub <noreply@github.com>"
commit-message: Update Tempo Distributed
labels: dependencies
branch: chore/update-tempo-distributed
delete-branch: true
updateMinio:
name: Update the Minio subchart
runs-on: "ubuntu-latest"
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install Updatecli
uses: updatecli/updatecli-action@v2
- name: Run Updatecli
id: update-minio
run: |
updatecli apply --config ${UPDATECLI_CONFIG_DIR}/minio.yaml
if ! git diff --exit-code > /dev/null; then
echo "changed=true" >> "${GITHUB_OUTPUT}"
fi
- name: Create pull request
if: steps.update-minio.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
title: "[dependency] Update the Minio subchart"
body: "Updates the Minio subchart"
base: main
author: "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>"
committer: "GitHub <noreply@github.com>"
commit-message: Update minio
labels: dependencies
branch: chore/update-minio
delete-branch: true
team-reviewers: "@grafana/loki-squad"
token: ${{ steps.app-token.outputs.token }}

View File

@ -1,7 +1,6 @@
---
name: helm-ci
on:
workflow_dispatch:
pull_request:
paths:
- "charts/meta-monitoring/**"
@ -25,7 +24,7 @@ jobs:
# runs-on: ubuntu-latest
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# uses: actions/checkout@v3
# with:
# fetch-depth: 0
@ -39,10 +38,10 @@ jobs:
# - name: Set up Python
# uses: actions/setup-python@v4
# with:
# python-version: 3.9
# python-version: 3.7
# - name: Set up chart-testing
# uses: helm/chart-testing-action@v2
# uses: helm/chart-testing-action@v2.4.0
# - name: Run chart-testing (list-changed)
# id: list-changed
@ -56,10 +55,10 @@ jobs:
# run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
# - name: Create kind cluster
# uses: helm/kind-action@v1
# uses: helm/kind-action@v1.8.0
# if: steps.list-changed.outputs.changed == 'true'
# with:
# config: "${{ github.workspace }}/.github/configs/cluster-config.yaml"
# config: tools/kind.config
# - name: Run chart-testing (install)
# run: |

View File

@ -1,6 +1,8 @@
# meta-monitoring-chart
This is a meta-monitoring chart for Loki, specifically Loki installed via the Loki helm chart.
This is a meta-monitoring chart for Loki.
Note that this is pre-production software at the moment.
## Local and cloud modes
@ -9,15 +11,19 @@ to small Loki, Mimir and Tempo installations running in the meta-monitoring name
![local mode](docs/images/Meta%20monitoring%20local.png)
To enable local mode set `local.<logs|metrics|traces>.enabled` to true.
In the cloud mode the logs, metrics and/or traces are sent to Grafana Cloud.
![cloud mode](docs/images/Meta%20monitoring%20cloud.png)
Both modes can be enabled at the same time. Cloud mode is preferred.
To enable cloud mode set `cloud.<logs|metrics|traces>.enabled` to true. The `endpoint`, `username` and `password` settings for your Grafana Cloud logs, metrics and traces instances have to be filled in as well.
Both modes can be enabled at the same time.
## Installation
For more instructions including how to install the chart go to the [installation](docs/installation.md) page.
For more instructions including how to update the chart go to the [installation](docs/installation.md) page.
## Supported features
@ -27,7 +33,10 @@ For more instructions including how to install the chart go to the [installation
- Specify PII regexes that are applied to logs before they are sent to Loki (cloud or local). The capture group in the regex is replaced with *****.
- a Grafana instance is installed (when local mode is used) with the relevant datasources installed. The following dashboards are installed:
- logs dashboards
- Alloy dashboards
- metrics dashboards
- traces dashboards
- agent dashboards
- Retention is set to 24 hours
Most of these features are enabled by default. See the values.yaml file for how to enable/disable them.
@ -35,7 +44,8 @@ Most of these features are enabled by default. See the values.yaml file for how
- This has not been tested on Openshift yet.
- The underlying Loki, Mimir and Tempo are at the default size installed by the Helm chart. This might need changing when monitoring bigger Loki, Mimir or Tempo installations.
- MinIO is used as storage for the local mode at the moment with a limited retention. At the moment this chart cannot be used for monitoring over longer periods.
- MinIO is used as storage at the moment with a limited retention. At the moment this chart cannot be used for monitoring over longer periods.
- Agent self monitoring is not done at the moment.
## Developer help topics

View File

@ -1,18 +1,18 @@
dependencies:
- name: loki
repository: https://grafana.github.io/helm-charts
version: 6.29.0
version: 6.3.4
- name: alloy
repository: https://grafana.github.io/helm-charts
version: 0.12.5
version: 0.1.1
- name: mimir-distributed
repository: https://grafana.github.io/helm-charts
version: 5.6.0
version: 5.3.0
- name: tempo-distributed
repository: https://grafana.github.io/helm-charts
version: 1.33.0
version: 1.9.4
- name: minio
repository: https://charts.min.io
version: 5.4.0
digest: sha256:5225a03d9003384639f5d43b1971126371269347f16f221b7aed377ab85d71be
generated: "2025-03-27T07:03:11.17404081Z"
version: 5.2.0
digest: sha256:33c7285f1b517a9de4c15a9cb5e32478e8fd07cba3601fa93f03f4925d792f04
generated: "2024-04-29T07:02:49.713859154Z"

View File

@ -13,7 +13,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.3.0
version: 0.0.2
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
@ -22,20 +22,20 @@ appVersion: "0.0.1"
dependencies:
- name: loki
repository: https://grafana.github.io/helm-charts
version: 6.29.0
version: 6.3.4
condition: local.logs.enabled
- name: alloy
repository: https://grafana.github.io/helm-charts
version: 0.12.5
version: 0.1.1
- name: mimir-distributed
repository: https://grafana.github.io/helm-charts
version: 5.6.0
version: 5.3.0
condition: local.metrics.enabled
- name: tempo-distributed
repository: https://grafana.github.io/helm-charts
version: 1.33.0
version: 1.9.4
condition: local.traces.enabled
- name: minio
repository: https://charts.min.io
version: 5.4.0
version: 5.2.0
condition: local.minio.enabled

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,786 @@
{
"annotations": {
"list": [ ]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"links": [ ],
"refresh": "30s",
"rows": [
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "Count",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [ ],
"type": "hidden",
"unit": "short"
},
{
"alias": "Uptime",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "Container",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "container",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "Pod",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "Version",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "version",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [ ],
"type": "string",
"unit": "short"
}
],
"targets": [
{
"expr": "count by (pod, container, version) (agent_build_info{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
},
{
"expr": "max by (pod, container) (time() - process_start_time_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Agent Stats",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Agent Stats",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])) by (pod, scrape_job) * 1e3",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}/{{scrape_job}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Target Sync",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (pod) (prometheus_sd_discovered_targets{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Targets",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Prometheus Discovery",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(prometheus_target_interval_length_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])\n/\nrate(prometheus_target_interval_length_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])\n* 1e3\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}} {{interval}} configured",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Average Scrape Interval Duration",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 5,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "exceeded sample limit: {{job}}",
"legendLink": null,
"step": 10
},
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "duplicate timestamp: {{job}}",
"legendLink": null,
"step": 10
},
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out of bounds: {{job}}",
"legendLink": null,
"step": 10
},
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out of order: {{job}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Scrape failures",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, instance_group_name) (rate(agent_wal_samples_appended_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{job}} {{instance_group_name}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Appended Samples",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Prometheus Retrieval",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"grafana-agent-mixin"
],
"templating": {
"list": [
{
"current": {
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "cluster",
"multi": true,
"name": "cluster",
"options": [ ],
"query": "label_values(agent_build_info, cluster)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "namespace",
"multi": true,
"name": "namespace",
"options": [ ],
"query": "label_values(agent_build_info, namespace)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "container",
"multi": true,
"name": "container",
"options": [ ],
"query": "label_values(agent_build_info, container)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": "grafana-agent-.*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "pod",
"multi": true,
"name": "pod",
"options": [ ],
"query": "label_values(agent_build_info{container=~\"$container\"}, pod)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Agent",
"uid": "",
"version": 0
}

File diff suppressed because it is too large Load Diff

View File

@ -1,540 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
},
{
"datasource": "$loki_datasource",
"enable": true,
"expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"",
"iconColor": "rgba(0, 211, 255, 1)",
"instant": false,
"name": "Deployments",
"titleFormat": "{{cluster}}/{{namespace}}"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 27,
"links": [
{
"icon": "doc",
"targetBlank": true,
"title": "Documentation",
"tooltip": "Clustering documentation",
"type": "link",
"url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode"
},
{
"asDropdown": true,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"alloy-mixin"
],
"targetBlank": false,
"title": "Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"datasource": "${datasource}",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "count(cluster_node_info{cluster=\"$cluster\", namespace=\"$namespace\"})",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "A"
}
],
"title": "Nodes",
"type": "stat"
},
{
"datasource": "${datasource}",
"description": "Nodes info.\n",
"fieldConfig": {
"defaults": {
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Dashboard"
},
"properties": [
{
"id": "mappings",
"value": [
{
"options": {
"1": {
"index": 0,
"text": "Link"
}
},
"type": "value"
}
]
},
{
"id": "links",
"value": [
{
"targetBlank": false,
"title": "Detail dashboard for node",
"url": "/d/4047e755d822da63c8158cde32ae4dce/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-cluster=${cluster}&var-namespace=${namespace}"
}
]
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 16,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"cellHeight": "sm",
"footer": {
"countRows": false,
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "cluster_node_info{cluster=\"$cluster\", namespace=\"$namespace\"}",
"format": "table",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "A"
}
],
"title": "Node table",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": false,
"__name__": true,
"cluster": true,
"namespace": true,
"state": false
},
"indexByName": {},
"renameByName": {
"Value": "Dashboard",
"instance": "",
"state": ""
}
}
}
],
"type": "table"
},
{
"datasource": "${datasource}",
"description": "Whether the cluster state has converged.\n\nIt is normal for the cluster state to be diverged briefly as gossip events propagate. It is not normal for the cluster state to be diverged for a long period of time.\n\nThis will show one of the following:\n\n* Converged: Nodes are aware of all other nodes, with the correct states.\n* Not converged: A subset of nodes aren't aware of their peers, or don't have an updated view of peer states.\n",
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"1": {
"color": "red",
"index": 1,
"text": "Not converged"
}
},
"type": "value"
},
{
"options": {
"match": "null",
"result": {
"color": "green",
"index": 0,
"text": "Converged"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "suffix:nodes"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 8,
"x": 0,
"y": 9
},
"id": 3,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "clamp((\n sum(stddev by (state) (cluster_node_peers{cluster=\"$cluster\", namespace=\"$namespace\"}) != 0) or\n (sum(abs(sum without (state) (cluster_node_peers{cluster=\"$cluster\", namespace=\"$namespace\"})) - scalar(count(cluster_node_info{cluster=\"$cluster\", namespace=\"$namespace\"})) != 0))\n ),\n 1, 1\n)\n",
"format": "time_series",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "A"
}
],
"title": "Convergance state",
"type": "stat"
},
{
"datasource": "${datasource}",
"fieldConfig": {
"defaults": {
"color": {
"mode": "continuous-GrYlRd"
},
"custom": {
"fillOpacity": 80,
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineWidth": 0,
"spanNulls": true
},
"mappings": [
{
"options": {
"0": {
"color": "green",
"text": "Yes"
}
},
"type": "value"
},
{
"options": {
"1": {
"color": "red",
"text": "No"
}
},
"type": "value"
}
],
"max": 1,
"noValue": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 16,
"x": 8,
"y": 9
},
"id": 4,
"options": {
"alignValue": "left",
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"mergeValues": true,
"rowHeight": 0.9,
"showValue": "auto",
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "ceil(clamp((\n sum(stddev by (state) (cluster_node_peers{cluster=\"$cluster\", namespace=\"$namespace\"})) or\n (sum(abs(sum without (state) (cluster_node_peers{cluster=\"$cluster\", namespace=\"$namespace\"})) - scalar(count(cluster_node_info{cluster=\"$cluster\", namespace=\"$namespace\"}))))\n ),\n 0, 1\n))\n",
"instant": false,
"legendFormat": "Converged",
"range": true,
"refId": "A"
}
],
"title": "Convergance state timeline",
"type": "state-timeline"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"alloy-mixin"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Mimir",
"value": "mimir_ds"
},
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "Loki",
"value": "loki_ds"
},
"hide": 0,
"includeAll": false,
"label": "Loki Data Source",
"multi": false,
"name": "loki_datasource",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components, cluster)\n",
"refId": "cluster"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n",
"refId": "namespace"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d",
"90d"
]
},
"timezone": "",
"title": "Alloy / Cluster Overview",
"uid": "",
"version": 0,
"weekStart": ""
}

View File

@ -1,970 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
},
{
"datasource": "$loki_datasource",
"enable": true,
"expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"",
"iconColor": "rgba(0, 211, 255, 1)",
"instant": false,
"name": "Deployments",
"titleFormat": "{{cluster}}/{{namespace}}"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 28,
"links": [
{
"icon": "doc",
"targetBlank": true,
"title": "Documentation",
"tooltip": "Component controller documentation",
"type": "link",
"url": "https://grafana.com/docs/alloy/latest/concepts/component_controller/"
},
{
"asDropdown": true,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"alloy-mixin"
],
"targetBlank": false,
"title": "Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"datasource": "${datasource}",
"description": "The number of Alloy instances whose metrics are being sent and reported.\n",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "instances"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 10,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "count(alloy_component_controller_evaluating{cluster=\"$cluster\", namespace=\"$namespace\"})",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Running instances",
"type": "stat"
},
{
"datasource": "${datasource}",
"description": "Breakdown of components by health across all running instances.\n\n* Healthy: components have been evaluated completely and are reporting themselves as healthy.\n* Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.\n* Unknown: A component has been created but has not yet been started.\n* Exited: A component has exited. It will not return to the running state.\n\nMore information on a component's health state can be retrieved using\nthe Alloy UI.\n\nNote that components may be in a degraded state even if they report\nthemselves as healthy. Use component-specific dashboards and alerts\nto observe detailed information about the behavior of a component.\n",
"fieldConfig": {
"defaults": {
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Unhealthy"
},
"properties": [
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Unknown"
},
"properties": [
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "blue",
"value": 1
}
]
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Exited"
},
"properties": [
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 1
}
]
}
}
]
}
]
},
"gridPos": {
"h": 12,
"w": 14,
"x": 10,
"y": 0
},
"id": 4,
"options": {
"displayMode": "gradient",
"maxVizHeight": 300,
"minVizHeight": 16,
"minVizWidth": 8,
"namePlacement": "auto",
"orientation": "vertical",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showUnfilled": true,
"sizing": "auto",
"valueMode": "color"
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\", health_type=\"healthy\"}) or vector(0)",
"instant": true,
"legendFormat": "Healthy",
"range": false,
"refId": "A"
},
{
"datasource": "${datasource}",
"expr": "sum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\", health_type=\"unhealthy\"}) or vector(0)",
"instant": true,
"legendFormat": "Unhealthy",
"range": false,
"refId": "B"
},
{
"datasource": "${datasource}",
"expr": "sum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\", health_type=\"unknown\"}) or vector(0)",
"instant": true,
"legendFormat": "Unknown",
"range": false,
"refId": "C"
},
{
"datasource": "${datasource}",
"expr": "sum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\", health_type=\"exited\"}) or vector(0)",
"instant": true,
"legendFormat": "Exited",
"range": false,
"refId": "D"
}
],
"title": "Components by health",
"type": "bargauge"
},
{
"datasource": "${datasource}",
"description": "The number of running components across all running instances.\n",
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "components"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 10,
"x": 0,
"y": 4
},
"id": 2,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"})",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Running components",
"type": "stat"
},
{
"datasource": "${datasource}",
"description": "The percentage of components which are in a healthy state.\n",
"fieldConfig": {
"defaults": {
"mappings": [],
"max": 1,
"min": 0,
"noValue": "No components",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 10,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"text": {
"valueSize": 80
},
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\",health_type=\"healthy\"}) /\nsum(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"})\n",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Overall component health",
"type": "stat"
},
{
"datasource": "${datasource}",
"description": "The frequency at which components get updated.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "points",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 3,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 0,
"y": 12
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "sum by (instance) (rate(alloy_component_evaluation_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Component evaluation rate",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "The percentiles for how long it takes to complete component evaluations.\n\nComponent evaluations must complete for components to have the latest\narguments. The longer the evaluations take, the slower it will be to\nreconcile the state of components.\n\nIf evaluation is taking too long, consider sharding your components to\ndeal with smaller amounts of data and reuse data as much as possible.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 8,
"y": 12
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))\nor\nhistogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))\n",
"instant": false,
"legendFormat": "99th percentile",
"range": true,
"refId": "A"
},
{
"datasource": "${datasource}",
"expr": "histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))\nor\nhistogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))\n",
"instant": false,
"legendFormat": "50th percentile",
"range": true,
"refId": "B"
},
{
"datasource": "${datasource}",
"expr": "(\n histogram_sum(sum(rate(alloy_component_evaluation_seconds{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval]))) /\n histogram_count(sum(rate(alloy_component_evaluation_seconds{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))\n)\nor\n(\n sum(rate(alloy_component_evaluation_seconds_sum{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])) /\n sum(rate(alloy_component_evaluation_seconds_count{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval]))\n)\n",
"instant": false,
"legendFormat": "Average",
"range": true,
"refId": "C"
}
],
"title": "Component evaluation time",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate.\n\nIdeally, no component should take more than 1 minute to evaluate. The components displayed in this chart\nmay be a sign of a problem with the pipeline.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 16,
"y": 12
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\n/ scalar(sum(rate(alloy_component_evaluation_seconds_sum{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))\n",
"instant": false,
"legendFormat": "{{component path}} {{component_id}}",
"range": true,
"refId": "A"
}
],
"title": "Slow components evaluation times",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 0,
"y": 22
},
"id": 8,
"maxDataPoints": 30,
"options": {
"calculate": false,
"cellGap": 0,
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 64
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 0.1
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": true
},
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum(increase(alloy_component_evaluation_seconds{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(alloy_component_evaluation_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\n",
"format": "heatmap",
"instant": false,
"legendFormat": "{{le}}",
"range": true,
"refId": "A"
}
],
"title": "Component evaluation histogram",
"type": "heatmap"
},
{
"datasource": "${datasource}",
"description": "Detailed histogram of how long components wait to be evaluated after their dependency is updated.\n\nThe goal is to design your config so that most of the time components do not\nqueue for long; under 10ms is a good goal.\n",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 8,
"y": 22
},
"id": 9,
"maxDataPoints": 30,
"options": {
"calculate": false,
"cellGap": 0,
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 64
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 0.1
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": true
},
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum(increase(alloy_component_dependencies_wait_seconds{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\n",
"format": "heatmap",
"instant": false,
"legendFormat": "{{le}}",
"range": true,
"refId": "A"
}
],
"title": "Component dependency wait histogram",
"type": "heatmap"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"alloy-mixin"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Mimir",
"value": "mimir_ds"
},
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "Loki",
"value": "loki_ds"
},
"hide": 0,
"includeAll": false,
"label": "Loki Data Source",
"multi": false,
"name": "loki_datasource",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components, cluster)\n",
"refId": "cluster"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n",
"refId": "namespace"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d",
"90d"
]
},
"timezone": "",
"title": "Alloy / Controller",
"uid": "bf9f456aad7108b2c808dbd9973e386f",
"version": 0,
"weekStart": ""
}

View File

@ -1,923 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 25,
"links": [
{
"asDropdown": true,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"alloy-mixin"
],
"targetBlank": false,
"title": "Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"datasource": "${datasource}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Receivers for traces [otelcol.receiver]",
"type": "row"
},
{
"datasource": "${datasource}",
"description": "Number of spans successfully pushed into the pipeline.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "hue",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(receiver_accepted_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{ pod }} / {{ transport }}",
"range": true,
"refId": "A"
}
],
"title": "Accepted spans",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Number of spans that could not be pushed into the pipeline.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "hue",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 8,
"y": 1
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(receiver_refused_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{ pod }} / {{ transport }}",
"range": true,
"refId": "A"
}
],
"title": "Refused spans",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "The duration of inbound RPCs.\n",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 16,
"y": 1
},
"id": 4,
"maxDataPoints": 30,
"options": {
"calculate": false,
"cellGap": 1,
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Oranges",
"steps": 65
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": true
},
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "ms"
}
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum by (le) (increase(rpc_server_duration_milliseconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", rpc_service=\"opentelemetry.proto.collector.trace.v1.TraceService\"}[$__rate_interval]))",
"format": "heatmap",
"instant": false,
"legendFormat": "{{le}}",
"range": true,
"refId": "A"
}
],
"title": "RPC server duration",
"type": "heatmap"
},
{
"datasource": "${datasource}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 11
},
"id": 5,
"title": "Batching of logs, metrics, and traces [otelcol.processor.batch]",
"type": "row"
},
{
"datasource": "${datasource}",
"description": "Number of spans, metric datapoints, or log lines in a batch\n",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 0,
"y": 12
},
"id": 6,
"maxDataPoints": 30,
"options": {
"calculate": false,
"cellGap": 1,
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Oranges",
"steps": 65
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": true
},
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "short"
}
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": "${datasource}",
"expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))",
"format": "heatmap",
"instant": false,
"legendFormat": "{{le}}",
"range": true,
"refId": "A"
}
],
"title": "Number of units in the batch",
"type": "heatmap"
},
{
"datasource": "${datasource}",
"description": "Number of distinct metadata value combinations being processed\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 8,
"y": 12
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "processor_batch_metadata_cardinality_ratio{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}\n",
"instant": false,
"legendFormat": "{{ pod }}",
"range": true,
"refId": "A"
}
],
"title": "Distinct metadata values",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Number of times the batch was sent due to a timeout trigger\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 16,
"y": 12
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{ pod }}",
"range": true,
"refId": "A"
}
],
"title": "Timeout trigger",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 22
},
"id": 9,
"title": "Exporters for traces [otelcol.exporter]",
"type": "row"
},
{
"datasource": "${datasource}",
"description": "Number of spans successfully sent to destination.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "hue",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 0,
"y": 23
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(exporter_sent_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{ pod }}",
"range": true,
"refId": "A"
}
],
"title": "Exported sent spans",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Number of spans in failed attempts to send to destination.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "hue",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 8,
"x": 8,
"y": 23
},
"id": 11,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(exporter_send_failed_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{ pod }}",
"range": true,
"refId": "A"
}
],
"title": "Exported failed spans",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"alloy-mixin"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Mimir",
"value": "mimir_ds"
},
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "Loki",
"value": "loki_ds"
},
"hide": 0,
"includeAll": false,
"label": "Loki Data Source",
"multi": false,
"name": "loki_datasource",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components, cluster)\n",
"refId": "cluster"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n",
"refId": "namespace"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": true,
"label": "instance",
"multi": true,
"name": "instance",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n",
"refId": "instance"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d",
"90d"
]
},
"timezone": "",
"title": "Alloy / OpenTelemetry",
"uid": "9b6d37c8603e19e8922133984faad93d",
"version": 0,
"weekStart": ""
}

File diff suppressed because it is too large Load Diff

View File

@ -1,840 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
},
{
"datasource": "$loki_datasource",
"enable": true,
"expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"",
"iconColor": "rgba(0, 211, 255, 1)",
"instant": false,
"name": "Deployments",
"titleFormat": "{{cluster}}/{{namespace}}"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 26,
"links": [
{
"asDropdown": true,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"alloy-mixin"
],
"targetBlank": false,
"title": "Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"datasource": "${datasource}",
"description": "CPU usage of the Alloy process relative to 1 CPU core.\n\nFor example, 100% means using one entire CPU core.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(alloy_resources_process_cpu_seconds_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "CPU usage",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Resident memory size of the Alloy process.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "alloy_resources_process_resident_memory_bytes{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Memory (RSS)",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Rate at which the Alloy process performs garbage collections.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "points",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 3,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(go_gc_duration_seconds_count{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[5m])\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Garbage collections",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Number of goroutines which are running in parallel. An infinitely\ngrowing number of these indicates a goroutine leak.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 8
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "go_goroutines{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Goroutines",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Heap memory currently in use by the Alloy process.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Memory (heap inuse)",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Rate of data received across all network interfaces for the machine\nAlloy is running on.\n\nData shown here is across all running processes and not exclusive to\nthe running Alloy process.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(alloy_resources_machine_rx_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Network receive bandwidth",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Rate of data sent across all network interfaces for the machine\nAlloy is running on.\n\nData shown here is across all running processes and not exclusive to\nthe running Alloy process.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${datasource}",
"expr": "rate(alloy_resources_machine_tx_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])\n",
"instant": false,
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Network send bandwidth",
"type": "timeseries"
}
],
"refresh": "10s",
"schemaVersion": 39,
"tags": [
"alloy-mixin"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Mimir",
"value": "mimir_ds"
},
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "Loki",
"value": "loki_ds"
},
"hide": 0,
"includeAll": false,
"label": "Loki Data Source",
"multi": false,
"name": "loki_datasource",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "datasource"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components, cluster)\n",
"refId": "cluster"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n",
"refId": "namespace"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": true,
"label": "instance",
"multi": true,
"name": "instance",
"options": [],
"query": {
"query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n",
"refId": "instance"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d",
"90d"
]
},
"timezone": "",
"title": "Alloy / Resources",
"uid": "d6a8574c31f3d7cb8f1345ec84d15a67",
"version": 0,
"weekStart": ""
}

View File

@ -51,7 +51,6 @@
"overrides": [ ]
},
"id": 1,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -99,7 +98,6 @@
"overrides": [ ]
},
"id": 2,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -159,7 +157,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -238,7 +235,6 @@
"overrides": [ ]
},
"id": 4,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -329,7 +325,6 @@
"overrides": [ ]
},
"id": 5,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -408,7 +403,6 @@
"overrides": [ ]
},
"id": 6,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -468,7 +462,6 @@
"overrides": [ ]
},
"id": 7,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -665,7 +658,6 @@
},
"fill": 10,
"id": 8,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -727,7 +719,6 @@
"overrides": [ ]
},
"id": 9,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -775,7 +766,6 @@
"overrides": [ ]
},
"id": 10,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -847,7 +837,6 @@
"hideZeroBuckets": false,
"highlightCards": true,
"id": 11,
"interval": "1m",
"legend": {
"show": true
},
@ -910,7 +899,6 @@
"hideZeroBuckets": false,
"highlightCards": true,
"id": 12,
"interval": "1m",
"legend": {
"show": true
},
@ -980,7 +968,6 @@
"overrides": [ ]
},
"id": 13,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1052,7 +1039,6 @@
"overrides": [ ]
},
"id": 14,
"interval": "1m",
"links": [ ],
"options": {
"legend": {

View File

@ -35,7 +35,6 @@
"fill": 1,
"format": "none",
"id": 1,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -111,7 +110,6 @@
"fill": 1,
"format": "dtdurations",
"id": 2,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -215,7 +213,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -263,7 +260,6 @@
"overrides": [ ]
},
"id": 4,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -311,7 +307,6 @@
"overrides": [ ]
},
"id": 5,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -371,7 +366,6 @@
"overrides": [ ]
},
"id": 6,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -419,7 +413,6 @@
"overrides": [ ]
},
"id": 7,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -467,7 +460,6 @@
"overrides": [ ]
},
"id": 8,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -527,7 +519,6 @@
"overrides": [ ]
},
"id": 9,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -575,7 +566,6 @@
"overrides": [ ]
},
"id": 10,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -613,7 +603,6 @@
{
"datasource": "$loki_datasource",
"id": 11,
"interval": "1m",
"span": 6,
"targets": [
{
@ -627,7 +616,6 @@
{
"datasource": "$loki_datasource",
"id": 12,
"interval": "1m",
"span": 6,
"targets": [
{

View File

@ -38,7 +38,6 @@
},
"hiddenSeries": false,
"id": 35,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -130,7 +129,6 @@
},
"hiddenSeries": false,
"id": 41,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -218,7 +216,6 @@
},
"hiddenSeries": false,
"id": 36,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -310,7 +307,6 @@
},
"hiddenSeries": false,
"id": 40,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -402,7 +398,6 @@
},
"hiddenSeries": false,
"id": 38,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -494,7 +489,6 @@
},
"hiddenSeries": false,
"id": 39,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -581,7 +575,6 @@
},
"hiddenSeries": false,
"id": 37,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -674,7 +667,6 @@
},
"hiddenSeries": false,
"id": 42,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -767,7 +759,6 @@
},
"hiddenSeries": false,
"id": 31,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -866,7 +857,6 @@
"y": 6
},
"id": 29,
"interval": "1m",
"maxDataPoints": "",
"options": {
"showLabels": false,

View File

@ -32,7 +32,6 @@
"y": 0
},
"id": 17,
"interval": "1m",
"panels": [ ],
"targets": [ ],
"title": "Main",
@ -62,7 +61,6 @@
},
"hiddenSeries": false,
"id": 6,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -159,7 +157,6 @@
},
"hiddenSeries": false,
"id": 7,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -254,7 +251,7 @@
},
"hiddenSeries": false,
"id": 2,
"interval": "1m",
"interval": "",
"legend": {
"avg": false,
"current": false,
@ -350,7 +347,6 @@
},
"hiddenSeries": false,
"id": 4,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -447,7 +443,6 @@
},
"hiddenSeries": false,
"id": 24,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -545,7 +540,6 @@
},
"hiddenSeries": false,
"id": 9,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -653,7 +647,6 @@
},
"hiddenSeries": false,
"id": 12,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -760,7 +753,6 @@
},
"hiddenSeries": false,
"id": 71,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -860,7 +852,6 @@
},
"hiddenSeries": false,
"id": 13,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -969,7 +960,6 @@
},
"hiddenSeries": false,
"id": 72,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1069,7 +1059,6 @@
},
"hiddenSeries": false,
"id": 10,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1179,7 +1168,6 @@
},
"hiddenSeries": false,
"id": 14,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1286,7 +1274,6 @@
},
"hiddenSeries": false,
"id": 73,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1387,7 +1374,6 @@
},
"hiddenSeries": false,
"id": 15,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1494,7 +1480,6 @@
},
"hiddenSeries": false,
"id": 74,
"interval": "1m",
"legend": {
"alignAsTable": true,
"avg": false,
@ -1598,7 +1583,6 @@
},
"hiddenSeries": false,
"id": 112,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -1683,7 +1667,6 @@
"y": 27
},
"id": 113,
"interval": "1m",
"pageSize": null,
"panels": [ ],
"showHeader": true,
@ -1797,7 +1780,6 @@
},
"hiddenSeries": false,
"id": 26,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -1824,7 +1806,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*distributor.*|(loki|enterprise-logs)-write.*|$namespace-[0-9]+)\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*distributor.*|(loki|enterprise-logs)-write)\"}[$__rate_interval]))",
"intervalFactor": 3,
"legendFormat": "{{pod}}",
"refId": "A"
@ -1892,7 +1874,6 @@
},
"hiddenSeries": false,
"id": 27,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -1921,7 +1902,7 @@
"steppedLine": false,
"targets": [
{
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"(.*/.*distributor|$namespace/(loki|enterprise-logs)-write|.*/loki|$namespace/loki-single-binary)\"}",
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"(.*/distributor|(loki|enterprise-logs)-write|.*/loki)\"}",
"instant": false,
"intervalFactor": 3,
"legendFormat": "{{pod}}",
@ -1985,7 +1966,6 @@
},
"hiddenSeries": false,
"id": 31,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2071,7 +2051,6 @@
"y": 32
},
"id": 29,
"interval": "1m",
"options": {
"showLabels": false,
"showTime": false,
@ -2106,7 +2085,6 @@
},
"hiddenSeries": false,
"id": 33,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2202,7 +2180,6 @@
},
"hiddenSeries": false,
"id": 32,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2297,7 +2274,6 @@
},
"hiddenSeries": false,
"id": 34,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2392,7 +2368,6 @@
},
"hiddenSeries": false,
"id": 35,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2498,7 +2473,6 @@
},
"hiddenSeries": false,
"id": 36,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2525,7 +2499,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*ingester.*|(loki|enterprise-logs)-write.*|loki-single-binary|$namespace-[0-9]+)\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*ingester.*|(loki|enterprise-logs)-write|loki-single-binary)\"}[$__rate_interval]))",
"intervalFactor": 3,
"legendFormat": "{{pod}}",
"refId": "A"
@ -2593,7 +2567,6 @@
},
"hiddenSeries": false,
"id": 37,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2622,7 +2595,7 @@
"steppedLine": false,
"targets": [
{
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"(.*ingester.*|(loki|enterprise-logs)-write.*|loki-single-binary|$namespace-[0-9]+)\"}",
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"(.*ingester.*|(loki|enterprise-logs)-write|loki-single-binary)\"}",
"instant": false,
"intervalFactor": 3,
"legendFormat": "{{pod}}",
@ -2686,7 +2659,6 @@
},
"hiddenSeries": false,
"id": 38,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2772,7 +2744,6 @@
"y": 32
},
"id": 39,
"interval": "1m",
"options": {
"showLabels": false,
"showTime": false,
@ -2807,7 +2778,6 @@
},
"hiddenSeries": false,
"id": 67,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -2914,7 +2884,6 @@
},
"hiddenSeries": false,
"id": 106,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3006,7 +2975,6 @@
},
"hiddenSeries": false,
"id": 108,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3114,7 +3082,6 @@
},
"hiddenSeries": false,
"id": 102,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3223,7 +3190,6 @@
"hideZeroBuckets": false,
"highlightCards": true,
"id": 100,
"interval": "1m",
"legend": {
"show": true
},
@ -3281,7 +3247,6 @@
},
"hiddenSeries": false,
"id": 96,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3308,7 +3273,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(reason) (rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=~\"($namespace)/(.*ingester.*|(loki|enterprise-logs)-write|loki-single-binary)\", namespace=~\"$namespace\"}[$__rate_interval])) / ignoring(reason) group_left sum(rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=~\"($namespace)/(.*ingester.*|(loki|enterprise-logs)-write|loki-single-binary)\", namespace=~\"$namespace\"}[$__rate_interval]))",
"expr": "sum by(reason) (rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=~\"$namespace/.*ingester.*\", namespace=~\"$namespace\"}[$__rate_interval])) / ignoring(reason) group_left sum(rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=~\"$namespace/.*ingester.*\", namespace=~\"$namespace\"}[$__rate_interval]))",
"interval": "",
"legendFormat": "{{ reason }}"
}
@ -3380,7 +3345,6 @@
"hideZeroBuckets": true,
"highlightCards": true,
"id": 98,
"interval": "1m",
"legend": {
"show": true
},
@ -3388,7 +3352,7 @@
"reverseYBuckets": false,
"targets": [
{
"expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/(.*ingester|(loki|enterprise-logs)-write|loki-single-binary)\"}[$__rate_interval]))",
"expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/(ingester|(loki|enterprise-logs)-write|loki-single-binary)\"}[$__rate_interval]))",
"format": "heatmap",
"instant": false,
"interval": "",
@ -3454,7 +3418,6 @@
},
"hiddenSeries": false,
"id": 68,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3481,7 +3444,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*querier.*|(loki|enterprise-logs)-read.*|loki-single-binary|$namespace-[0-9]+)\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"(.*querier.*|(loki|enterprise-logs)-read|loki-single-binary)\"}[$__rate_interval]))",
"intervalFactor": 3,
"legendFormat": "{{pod}}",
"refId": "A"
@ -3549,7 +3512,6 @@
},
"hiddenSeries": false,
"id": 69,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3578,7 +3540,7 @@
"steppedLine": false,
"targets": [
{
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"(.*querier.*|(loki|enterprise-logs)-read.*|.*loki-single-binary|$namespace-[0-9]+)\"}",
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"(.*querier.*|(loki|enterprise-logs)-read|.*loki-single-binary)\"}",
"instant": false,
"intervalFactor": 3,
"legendFormat": "{{pod}}",
@ -3642,7 +3604,6 @@
},
"hiddenSeries": false,
"id": 65,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3762,7 +3723,6 @@
},
"hiddenSeries": false,
"id": 70,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -3874,7 +3834,7 @@
},
"hiddenSeries": false,
"id": 53,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -3978,7 +3938,7 @@
},
"hiddenSeries": false,
"id": 54,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4091,7 +4051,7 @@
},
"hiddenSeries": false,
"id": 55,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4200,7 +4160,7 @@
},
"hiddenSeries": false,
"id": 58,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4313,7 +4273,7 @@
},
"hiddenSeries": false,
"id": 61,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4417,7 +4377,7 @@
},
"hiddenSeries": false,
"id": 62,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4529,7 +4489,7 @@
"y": 10
},
"id": 79,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4632,7 +4592,7 @@
"y": 18
},
"id": 80,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4744,7 +4704,7 @@
"y": 10
},
"id": 79,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4847,7 +4807,7 @@
"y": 18
},
"id": 80,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -4959,7 +4919,7 @@
"y": 10
},
"id": 115,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
@ -5062,7 +5022,7 @@
"y": 18
},
"id": 116,
"interval": "1m",
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,

View File

@ -90,7 +90,6 @@
]
},
"id": 1,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -104,19 +103,19 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\"}[$__rate_interval]))",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", resource=\"cpu\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\", resource=\"cpu\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\"})",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -192,7 +191,6 @@
]
},
"id": 2,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -206,19 +204,19 @@
"span": 4,
"targets": [
{
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", resource=\"memory\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\", resource=\"memory\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"} > 0)",
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend\"} > 0)",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -255,7 +253,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -269,7 +266,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*query-frontend|loki-read|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -357,7 +354,6 @@
]
},
"id": 4,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -371,19 +367,19 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\"}[$__rate_interval]))",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\", resource=\"cpu\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\", resource=\"cpu\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\"})",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -459,7 +455,6 @@
]
},
"id": 5,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -473,19 +468,19 @@
"span": 4,
"targets": [
{
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\", resource=\"memory\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\", resource=\"memory\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler|loki\", pod=~\"query-scheduler|loki-read-.*|$namespace-[0-9]*\"} > 0)",
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-scheduler\"} > 0)",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -522,7 +517,6 @@
"overrides": [ ]
},
"id": 6,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -536,7 +530,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*query-scheduler|loki-read|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*query-scheduler\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -625,7 +619,6 @@
},
"gridPos": { },
"id": 7,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -638,19 +631,19 @@
},
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\"}[$__rate_interval]))",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", resource=\"cpu\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\", resource=\"cpu\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\"})",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -727,7 +720,6 @@
},
"gridPos": { },
"id": 8,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -740,19 +732,19 @@
},
"targets": [
{
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", resource=\"memory\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\", resource=\"memory\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"} > 0)",
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"querier\"} > 0)",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -790,7 +782,6 @@
},
"gridPos": { },
"id": 9,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -803,7 +794,7 @@
},
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*querier|loki-read|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -841,7 +832,6 @@
},
"gridPos": { },
"id": 10,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -854,7 +844,7 @@
},
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -889,7 +879,6 @@
},
"gridPos": { },
"id": 11,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -902,7 +891,7 @@
},
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"query-frontend|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -937,7 +926,6 @@
},
"gridPos": { },
"id": 12,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1037,7 +1025,6 @@
},
"gridPos": { },
"id": 13,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1139,7 +1126,6 @@
},
"gridPos": { },
"id": 14,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1202,7 +1188,6 @@
},
"gridPos": { },
"id": 15,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1253,7 +1238,6 @@
},
"gridPos": { },
"id": 16,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1301,7 +1285,6 @@
},
"gridPos": { },
"id": 17,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1349,7 +1332,6 @@
},
"gridPos": { },
"id": 18,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1449,7 +1431,6 @@
},
"gridPos": { },
"id": 19,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1462,19 +1443,19 @@
},
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"}[$__rate_interval]))",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", resource=\"cpu\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"cpu\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -1551,7 +1532,6 @@
},
"gridPos": { },
"id": 20,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1564,19 +1544,19 @@
},
"targets": [
{
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"})",
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", resource=\"memory\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"memory\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\"} > 0)",
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} > 0)",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -1614,7 +1594,6 @@
},
"gridPos": { },
"id": 21,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1627,7 +1606,7 @@
},
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*bloom-gateway|loki-read|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*bloom-gateway\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -1665,7 +1644,6 @@
},
"gridPos": { },
"id": 22,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1678,7 +1656,7 @@
},
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1713,7 +1691,6 @@
},
"gridPos": { },
"id": 23,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1726,7 +1703,7 @@
},
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway|loki\", pod=~\"query-frontend|loki-read-.*|$namespace-[0-9]*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1761,7 +1738,6 @@
},
"gridPos": { },
"id": 24,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1860,7 +1836,6 @@
]
},
"id": 25,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1962,7 +1937,6 @@
]
},
"id": 26,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -2025,7 +1999,6 @@
"overrides": [ ]
},
"id": 27,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -2089,7 +2062,6 @@
},
"gridPos": { },
"id": 28,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -2176,7 +2148,6 @@
},
"gridPos": { },
"id": 29,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -2189,19 +2160,19 @@
},
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"}[$__rate_interval]))",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\", resource=\"cpu\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\", resource=\"cpu\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\"})",
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"})",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -2278,7 +2249,6 @@
},
"gridPos": { },
"id": 30,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -2291,19 +2261,19 @@
},
"targets": [
{
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\"})",
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
},
{
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\", resource=\"memory\"} > 0)",
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\", resource=\"memory\"} > 0)",
"format": "time_series",
"legendFormat": "request",
"legendLink": null
},
{
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler|loki\", pod=~\"ruler|loki-backend-.*|$namespace-[0-9]*\"} > 0)",
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"} > 0)",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -2341,7 +2311,6 @@
},
"gridPos": { },
"id": 31,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -2354,7 +2323,7 @@
},
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*ruler|loki-backend|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null

View File

@ -200,7 +200,6 @@
},
"fill": 10,
"id": 1,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -250,7 +249,6 @@
"overrides": [ ]
},
"id": 2,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -335,7 +333,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -548,7 +545,6 @@
},
"fill": 10,
"id": 4,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -598,7 +594,6 @@
"overrides": [ ]
},
"id": 5,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -683,7 +678,6 @@
"overrides": [ ]
},
"id": 6,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -896,7 +890,6 @@
},
"fill": 10,
"id": 7,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -946,7 +939,6 @@
"overrides": [ ]
},
"id": 8,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1031,7 +1023,6 @@
"overrides": [ ]
},
"id": 9,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1244,7 +1235,6 @@
},
"fill": 10,
"id": 10,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -1294,7 +1284,6 @@
"overrides": [ ]
},
"id": 11,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1379,7 +1368,6 @@
"overrides": [ ]
},
"id": 12,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1592,7 +1580,6 @@
},
"fill": 10,
"id": 13,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -1642,7 +1629,6 @@
"overrides": [ ]
},
"id": 14,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1727,7 +1713,6 @@
"overrides": [ ]
},
"id": 15,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1940,7 +1925,6 @@
},
"fill": 10,
"id": 16,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -1990,7 +1974,6 @@
"overrides": [ ]
},
"id": 17,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -2075,7 +2058,6 @@
"overrides": [ ]
},
"id": 18,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -2288,7 +2270,6 @@
},
"fill": 10,
"id": 19,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -2338,7 +2319,6 @@
"overrides": [ ]
},
"id": 20,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -2417,7 +2397,6 @@
"overrides": [ ]
},
"id": 21,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -2449,7 +2428,7 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "TSDB Index",
"title": "TSBD Index",
"titleSize": "h6"
},
{
@ -2630,7 +2609,6 @@
},
"fill": 10,
"id": 22,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -2680,7 +2658,6 @@
"overrides": [ ]
},
"id": 23,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -2759,7 +2736,6 @@
"overrides": [ ]
},
"id": 24,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {

View File

@ -90,7 +90,6 @@
]
},
"id": 1,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -192,7 +191,6 @@
]
},
"id": 2,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -255,7 +253,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -269,7 +266,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/\"(.*compactor|(loki|enterprise-logs)-backend.*|loki-single-binary)\"\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -320,7 +317,6 @@
},
"fill": 1,
"id": 4,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -427,7 +423,6 @@
"overrides": [ ]
},
"id": 5,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -487,7 +482,6 @@
"overrides": [ ]
},
"id": 6,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -535,7 +529,6 @@
"overrides": [ ]
},
"id": 7,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -597,7 +590,6 @@
},
"fill": 1,
"id": 8,
"interval": "1m",
"legend": {
"avg": false,
"current": false,
@ -704,7 +696,6 @@
"overrides": [ ]
},
"id": 9,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -752,7 +743,6 @@
"overrides": [ ]
},
"id": 10,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -812,7 +802,6 @@
"overrides": [ ]
},
"id": 11,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -860,7 +849,6 @@
"overrides": [ ]
},
"id": 12,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -908,7 +896,6 @@
"overrides": [ ]
},
"id": 13,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -969,7 +956,6 @@
},
"format": "short",
"id": 14,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1018,7 +1004,6 @@
"overrides": [ ]
},
"id": 15,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1110,7 +1095,6 @@
},
"format": "short",
"id": 16,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1159,7 +1143,6 @@
"overrides": [ ]
},
"id": 17,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1250,7 +1233,6 @@
"overrides": [ ]
},
"id": 18,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1298,7 +1280,6 @@
"overrides": [ ]
},
"id": 19,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -1346,7 +1327,6 @@
"overrides": [ ]
},
"id": 20,
"interval": "1m",
"links": [ ],
"options": {
"legend": {

View File

@ -90,7 +90,6 @@
]
},
"id": 1,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -104,7 +103,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor|loki\", pod=~\"distributor|loki-write-.*|$namespace-[0-9]*\"}[$__rate_interval]))",
"expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor\"}[$__rate_interval]))",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -116,7 +115,7 @@
"legendLink": null
},
{
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor|loki\", pod=~\"distributor|loki-write-.*|$namespace-[0-9]*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor|loki\", pod=~\"distributor|loki-write-.*|$namespace-[0-9]*\"})",
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor\"})",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -192,7 +191,6 @@
]
},
"id": 2,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -206,7 +204,7 @@
"span": 4,
"targets": [
{
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor|loki\", pod=~\"distributor|loki-write-.*|$namespace-[0-9]*\"})",
"expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -218,7 +216,7 @@
"legendLink": null
},
{
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor|loki\", pod=~\"distributor|loki-write-.*|$namespace-[0-9]*\"} > 0)",
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"distributor\"} > 0)",
"format": "time_series",
"legendFormat": "limit",
"legendLink": null
@ -255,7 +253,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -269,7 +266,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(.*distributor|loki-write|loki-single-binary)\"})",
"expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"})",
"format": "time_series",
"legendFormat": "{{pod}}",
"legendLink": null
@ -319,7 +316,6 @@
},
"gridPos": { },
"id": 4,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -409,7 +405,6 @@
},
"gridPos": { },
"id": 5,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -511,7 +506,6 @@
},
"gridPos": { },
"id": 6,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -574,7 +568,6 @@
},
"gridPos": { },
"id": 7,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -625,7 +618,6 @@
},
"gridPos": { },
"id": 8,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -673,7 +665,6 @@
},
"gridPos": { },
"id": 9,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -721,7 +712,6 @@
},
"gridPos": { },
"id": 10,
"interval": "1m",
"links": [ ],
"options": {
"legend": {

View File

@ -200,7 +200,6 @@
},
"fill": 10,
"id": 1,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -250,7 +249,6 @@
"overrides": [ ]
},
"id": 2,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -347,7 +345,6 @@
"overrides": [ ]
},
"id": 3,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -395,7 +392,6 @@
"overrides": [ ]
},
"id": 4,
"interval": "1m",
"links": [ ],
"options": {
"legend": {
@ -623,7 +619,6 @@
},
"fill": 10,
"id": 5,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -673,7 +668,6 @@
"overrides": [ ]
},
"id": 6,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -919,7 +913,6 @@
},
"fill": 10,
"id": 7,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -969,7 +962,6 @@
"overrides": [ ]
},
"id": 8,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1215,7 +1207,6 @@
},
"fill": 10,
"id": 9,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -1265,7 +1256,6 @@
"overrides": [ ]
},
"id": 10,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {
@ -1505,7 +1495,6 @@
},
"fill": 10,
"id": 11,
"interval": "1m",
"linewidth": 0,
"links": [ ],
"options": {
@ -1555,7 +1544,6 @@
"overrides": [ ]
},
"id": 12,
"interval": "1m",
"links": [ ],
"nullPointMode": "null as zero",
"options": {

View File

@ -1,4 +1,3 @@
groups:
- name: "loki_rules"
rules:
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[5m]))

View File

@ -0,0 +1,555 @@
groups:
- name: "mimir_api_1"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[5m]))
by (cluster, job)"
record: "cluster_job:cortex_request_duration_seconds:avg"
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job)"
record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
- name: "mimir_api_2"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)
/ sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
record: "cluster_job_route:cortex_request_duration_seconds:avg"
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, job,
route)"
record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, job, route)"
record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, job, route)"
record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
- name: "mimir_api_3"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[5m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[5m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
job, route) / sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
- expr: "sum(rate(cortex_request_duration_seconds_bucket[5m])) by (le, cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_sum[5m])) by (cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_count[5m])) by (cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
- name: "mimir_querier_api"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
job) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds:avg"
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m])) by
(cluster, job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[5m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[5m]))
by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[5m])) by (le, cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[5m])) by (cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[5m])) by (cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
- name: "mimir_cache"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[5m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[5m]))
by (cluster, job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[5m])) by (le, cluster,
job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[5m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[5m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)
/ sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_cache_request_duration_seconds:avg"
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
job)"
record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
job)"
record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[5m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
method) / sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[5m])) by (le, cluster,
job, method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[5m])) by (cluster, job,
method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[5m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
- name: "mimir_storage"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)
/ sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_kv_request_duration_seconds:avg"
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[5m])) by (le, cluster,
job)"
record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
- name: "mimir_queries"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_retries:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_retries:50quantile"
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[5m]))
by (cluster, job)"
record: "cluster_job:cortex_query_frontend_retries:avg"
- expr: "sum(rate(cortex_query_frontend_retries_bucket[5m])) by (le, cluster, job)"
record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
- expr: "sum(rate(cortex_query_frontend_retries_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
- expr: "sum(rate(cortex_query_frontend_retries_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by
(cluster, job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[5m])) by (le,
cluster, job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[5m])) by (cluster,
job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[5m])) by (cluster,
job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
- name: "mimir_ingester_queries"
rules:
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_series:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_series:50quantile"
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[5m]))
by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_series:avg"
- expr: "sum(rate(cortex_ingester_queried_series_bucket[5m])) by (le, cluster, job)"
record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_series_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_series_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_samples:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_samples:50quantile"
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[5m]))
by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples:avg"
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[5m])) by (le, cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_samples_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_samples_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[5m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job) /
sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_exemplars:avg"
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[5m])) by (le, cluster,
job)"
record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[5m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
- name: "mimir_received_samples"
rules:
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
- name: "mimir_exemplars_in"
rules:
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
- name: "mimir_received_exemplars"
rules:
- expr: "sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
- name: "mimir_exemplars_ingested"
rules:
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
- name: "mimir_exemplars_appended"
rules:
- expr: "sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
- name: "mimir_scaling_rules"
rules:
- expr: |
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
sum by (cluster, namespace, deployment) (
label_replace(
kube_deployment_spec_replicas,
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
or
sum by (cluster, namespace, deployment) (
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
)
record: "cluster_namespace_deployment:actual_replicas:count"
- expr: |
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
)[24h:]
)
/ 240000
)
labels:
deployment: "distributor"
reason: "sample_rate"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
* 0.59999999999999998 / 240000
)
labels:
deployment: "distributor"
reason: "sample_rate_limits"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
)[24h:]
)
* 3 / 80000
)
labels:
deployment: "ingester"
reason: "sample_rate"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
ceil(
quantile_over_time(0.99,
sum by(cluster, namespace) (
cortex_ingester_memory_series
)[24h:]
)
/ 1500000
)
labels:
deployment: "ingester"
reason: "active_series"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
* 3 * 0.59999999999999998 / 1500000
)
labels:
deployment: "ingester"
reason: "active_series_limits"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
* 0.59999999999999998 / 80000
)
labels:
deployment: "ingester"
reason: "sample_rate_limits"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
ceil(
(sum by (cluster, namespace) (
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
) / 4)
/
avg by (cluster, namespace) (
memcached_limit_bytes{job=~".+/memcached"}
)
)
labels:
deployment: "memcached"
reason: "active_series"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[5m])),
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
- expr: |
# Convenience rule to get the CPU request for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
#
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
# where kube_pod_container_resource_requests_cpu_cores was removed:
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests_cpu_cores,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
or
# This expression is compatible with kube-state-metrics >= v1.4.0,
# where kube_pod_container_resource_requests was introduced.
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="cpu"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
- expr: |
# Jobs should be sized to their CPU usage.
# We do this by comparing 99th percentile usage over the last 24hrs to
# their current provisioned #replicas and resource requests.
ceil(
cluster_namespace_deployment:actual_replicas:count
*
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
/
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
)
labels:
reason: "cpu_usage"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: |
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
container_memory_usage_bytes{image!=""},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
- expr: |
# Convenience rule to get the Memory request for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
#
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
# where kube_pod_container_resource_requests_memory_bytes was removed:
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests_memory_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
or
# This expression is compatible with kube-state-metrics >= v1.4.0,
# where kube_pod_container_resource_requests was introduced.
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="memory"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
- expr: |
# Jobs should be sized to their Memory usage.
# We do this by comparing 99th percentile usage over the last 24hrs to
# their current provisioned #replicas and resource requests.
ceil(
cluster_namespace_deployment:actual_replicas:count
*
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
/
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
)
labels:
reason: "memory_usage"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- name: "mimir_alertmanager_rules"
rules:
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_silences)"
record: "cluster_job_pod:cortex_alertmanager_silences:sum"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
- name: "mimir_ingester_rules"
rules:
- expr: "sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[5m]))"
record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"

View File

@ -0,0 +1,15 @@
groups:
- name: "tempo_rules"
rules:
- expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
- expr: "sum(rate(tempo_request_duration_seconds_bucket[5m])) by (le, cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(tempo_request_duration_seconds_sum[5m])) by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(tempo_request_duration_seconds_count[5m])) by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"

View File

@ -9,15 +9,6 @@
{{- define "agent.all_namespaces" -}}
{{- $list := list }}
{{- range .Values.namespacesToMonitor }}
{{- $list = append $list (printf "\"%s\"" .) }}
{{- end }}
{{- $list = append $list (printf "\"%s\"" .Release.Namespace) }}
{{- join ", " $list }}
{{- end }}
{{- define "agent.all_namespaces_bar" -}}
{{- $list := list }}
{{- range .Values.namespacesToMonitor }}
{{- $list = append $list (printf "%s" .) }}
{{- end }}
{{- $list = append $list .Release.Namespace }}
@ -57,32 +48,10 @@
{{- define "agent.tempo_write_targets" -}}
{{- $list := list }}
{{- if .Values.local.traces.enabled }}
{{- $list = append $list ("otelcol.exporter.otlphttp.local.input") }}
{{- $list = append $list ("otelcol.exporter.otlp.local.input") }}
{{- end }}
{{- if .Values.cloud.traces.enabled }}
{{- $list = append $list ("otelcol.exporter.otlphttp.cloud.input") }}
{{- end }}
{{- join ", " $list }}
{{- end }}
{{- define "agent.all_logs" -}}
{{- $list := list }}
{{- range .Values.logs.retain }}
{{- $list = append $list . }}
{{- end }}
{{- range .Values.logs.extraLogs }}
{{- $list = append $list . }}
{{- end }}
{{- join "|" $list }}
{{- end }}
{{- define "agent.all_metrics" -}}
{{- $list := list }}
{{- range .Values.metrics.retain }}
{{- $list = append $list . }}
{{- end }}
{{- range .Values.metrics.extraMetrics }}
{{- $list = append $list . }}
{{- end }}
{{- join "|" $list }}
{{- end }}

View File

@ -59,9 +59,9 @@ data:
loki.process "filter" {
forward_to = [ {{ include "agent.loki_write_targets" . }} ]
{{- if or (not (empty .Values.logs.retain)) (not (empty .Values.logs.extraLogs)) }}
{{- if not (empty .Values.logs.retain) }}
stage.match {
selector = "{cluster=\"{{- .Values.clusterLabelValue -}}\", namespace=~\"{{- join "|" .Values.namespacesToMonitor -}}|{{- $.Release.Namespace -}}\", pod=~\"loki.*\"} !~ \"{{ include "agent.all_logs" . }}\""
selector = "{cluster=\"{{- .Values.clusterLabelValue -}}\", namespace=~\"{{- join "|" .Values.namespacesToMonitor -}}|{{- $.Release.Namespace -}}\", pod=~\"loki.*\"} !~ \"{{ join "|" .Values.logs.retain }}\""
action = "drop"
}
{{- end }}
@ -93,7 +93,7 @@ data:
role = "pod"
namespaces {
own_namespace = true
names = [ {{ include "agent.all_namespaces" . }} ]
names = [ {{ include "agent.namespaces" . }} ]
}
}
@ -120,9 +120,9 @@ data:
replacement = "{{- .Values.clusterLabelValue -}}"
}
rule {
source_labels = ["__meta_kubernetes_pod_container_port_name"]
action = "keep"
regex = ".*metrics.*"
source_labels = ["__meta_kubernetes_pod_container_port_number"]
action = "drop"
regex = "9095"
}
}
@ -135,27 +135,148 @@ data:
}
prometheus.relabel "filter" {
rule {
target_label = "cluster"
replacement = "{{- .Values.clusterLabelValue -}}"
}
rule {
source_labels = ["__name__"]
regex = "({{ include "agent.all_metrics" . }})"
regex = "({{ join "|" .Values.metrics.retain }})"
action = "keep"
}
rule {
source_labels = ["namespace"]
regex = "{{ include "agent.all_namespaces_bar" . }}"
regex = "{{ include "agent.all_namespaces" . }}"
action = "keep"
}
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
}
{{- if .Values.kubeStateMetrics.enabled }}
prometheus.scrape "kubeStateMetrics" {
clustering {
enabled = true
}
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
forward_to = [ prometheus.relabel.filter.receiver ]
}
{{- end }}
// cAdvisor and Kubelet metrics
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
discovery.kubernetes "all_nodes" {
role = "node"
namespaces {
own_namespace = true
names = [ {{ include "agent.namespaces" . }} ]
}
}
discovery.relabel "all_nodes" {
targets = discovery.kubernetes.all_nodes.targets
rule {
source_labels = ["__meta_kubernetes_node_name"]
target_label = "node"
}
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
separator = "/"
regex = "(.*)/(.*)/(.*)"
replacement = "${1}/${2}-${3}"
target_label = "job"
}
rule {
target_label = "cluster"
replacement = "{{- .Values.clusterLabelValue -}}"
}
}
prometheus.scrape "cadvisor" {
clustering {
enabled = true
}
targets = discovery.relabel.all_nodes.output
forward_to = [ prometheus.relabel.filter.receiver ]
metrics_path = "/metrics/cadvisor"
scheme = "https"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
}
}
prometheus.scrape "kubelet" {
clustering {
enabled = true
}
targets = discovery.relabel.all_nodes.output
forward_to = [ prometheus.relabel.filter.receiver ]
metrics_path = "/metrics"
scheme = "https"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
}
}
prometheus.exporter.unix "promexporter" {}
prometheus.scrape "node_exporter" {
clustering {
enabled = true
}
targets = prometheus.exporter.unix.promexporter.targets
forward_to = [prometheus.relabel.node_exporter.receiver]
job_name = "node-exporter"
}
prometheus.relabel "node_exporter" {
forward_to = [ prometheus.relabel.filter.receiver ]
rule {
replacement = env("HOSTNAME")
target_label = "nodename"
}
rule {
replacement = "node-exporter"
target_label = "job"
}
rule {
source_labels = ["__meta_kubernetes_node_name"]
target_label = "node"
}
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
separator = "/"
regex = "(.*)/(.*)/(.*)"
replacement = "${1}/${2}-${3}"
target_label = "job"
}
rule {
target_label = "cluster"
replacement = "{{- .Values.clusterLabelValue -}}"
}
}
{{- end }}
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
@ -173,7 +294,9 @@ data:
// We don't technically need this, but it shows how to change listen address and incoming port.
// In this case, the Agent is listening on all available bindable addresses on port 4317 (which is the
// default OTLP gRPC port) for the OTLP protocol.
grpc {}
grpc {
endpoint = "0.0.0.0:4317"
}
// We define where to send the output of all ingested traces. In this case, to the OpenTelemetry batch processor
// named 'default'.
@ -209,7 +332,7 @@ data:
{{- if .Values.local.logs.enabled }}
loki.write "local" {
endpoint {
url = "http://loki-write.{{- .Release.Namespace -}}.svc.cluster.local:3100/loki/api/v1/push"
url = "http://{{- .Release.Namespace -}}-loki-gateway.{{- .Release.Namespace -}}.svc.cluster.local:80/loki/api/v1/push"
}
}
{{- end }}
@ -222,14 +345,6 @@ data:
}
{{- end }}
{{- if .Values.local.traces.enabled }}
otelcol.exporter.otlphttp "local" {
client {
endpoint = "http://{{- .Release.Name -}}-tempo-distributor.{{- .Release.Namespace -}}.svc:4318"
}
}
{{- end }}
{{- if .Values.cloud.logs.enabled }}
loki.write "cloud" {
endpoint {

View File

@ -0,0 +1,19 @@
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: agent-dashboards-1
namespace: {{ $.Release.Namespace }}
data:
"agent-logs-pipeline.json": |
{{ $.Files.Get "src/dashboards/agent-logs-pipeline.json" | fromJson | toJson }}
"agent-operational.json": |
{{ $.Files.Get "src/dashboards/agent-operational.json" | fromJson | toJson }}
"agent-remote-write.json": |
{{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }}
"agent-tracing-pipeline.json": |
{{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }}
"agent.json": |
{{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }}
{{- end }}

View File

@ -1,21 +0,0 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: alloy-dashboards-1
namespace: {{ $.Release.Namespace }}
data:
"alloy-cluster-node.json": |
{{ $.Files.Get "src/dashboards/alloy-cluster-node.json" | fromJson | toJson }}
"alloy-cluster-overview.json": |
{{ $.Files.Get "src/dashboards/alloy-cluster-overview.json" | fromJson | toJson }}
"alloy-controller.json": |
{{ $.Files.Get "src/dashboards/alloy-controller.json" | fromJson | toJson }}
"alloy-opentelemetry.json": |
{{ $.Files.Get "src/dashboards/alloy-opentelemetry.json" | fromJson | toJson }}
"alloy-prometheus.json": |
{{ $.Files.Get "src/dashboards/alloy-prometheus.json" | fromJson | toJson }}
"alloy-resources.json": |
{{ $.Files.Get "src/dashboards/alloy-resources.json" | fromJson | toJson }}
{{- end }}

View File

@ -1,4 +1,4 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
---
apiVersion: v1
kind: ConfigMap
@ -28,12 +28,64 @@ data:
orgId: 1
type: file
{{- end }}
{{- if .Values.dashboards.metrics.enabled }}
- disableDeletion: true
editable: false
folder: Alloy
name: alloy-1
folder: Mimir
name: mimir-1
options:
path: /var/lib/grafana/dashboards/alloy-1
path: /var/lib/grafana/dashboards/mimir-1
orgId: 1
type: file
- disableDeletion: true
editable: false
folder: Mimir
name: mimir-2
options:
path: /var/lib/grafana/dashboards/mimir-2
orgId: 1
type: file
- disableDeletion: true
editable: false
folder: Mimir
name: mimir-3
options:
path: /var/lib/grafana/dashboards/mimir-3
orgId: 1
type: file
- disableDeletion: true
editable: false
folder: Mimir
name: mimir-4
options:
path: /var/lib/grafana/dashboards/mimir-4
orgId: 1
type: file
- disableDeletion: true
editable: false
folder: Mimir
name: mimir-5
options:
path: /var/lib/grafana/dashboards/mimir-5
orgId: 1
type: file
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
- disableDeletion: true
editable: false
folder: Tempo
name: tempo-1
options:
path: /var/lib/grafana/dashboards/tempo-1
orgId: 1
type: file
{{- end }}
- disableDeletion: true
editable: false
folder: Agent
name: agent-1
options:
path: /var/lib/grafana/dashboards/agent-1
orgId: 1
type: file
{{- end }}

View File

@ -61,10 +61,6 @@ data:
# <bool> Allows users to edit data sources from the
# Grafana UI.
editable: true
# Extra config.
jsonData:
# Scrape interval
timeInterval: 1m
{{- end }}
{{- if .Values.local.traces.enabled }}
- name: Tempo

View File

@ -1,12 +0,0 @@
{{- if .Values.local.grafana.enabled }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
{{- end }}

View File

@ -1,15 +0,0 @@
{{- if .Values.local.grafana.enabled }}
apiVersion: v1
kind: Service
metadata:
name: grafana
spec:
ports:
- port: 3000
protocol: TCP
targetPort: http-grafana
selector:
app: grafana
sessionAffinity: None
type: ClusterIP # Make this configurable
{{- end }}

View File

@ -1,4 +1,16 @@
{{- if .Values.local.grafana.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
@ -20,7 +32,7 @@ spec:
- 0
containers:
- name: grafana
image: grafana/grafana:{{- .Values.grafana.version }}
image: grafana/grafana:10.0.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
@ -53,7 +65,7 @@ spec:
name: grafana-pv
- mountPath: /etc/grafana/provisioning/datasources
name: datasources-provisioning
{{- if .Values.dashboards.logs.enabled }}
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
- mountPath: /etc/grafana/provisioning/dashboards
name: dashboards-provisioning
{{- end }}
@ -63,8 +75,24 @@ spec:
- mountPath: /var/lib/grafana/dashboards/loki-2
name: loki-dashboards-2
{{- end }}
- mountPath: /var/lib/grafana/dashboards/alloy-1
name: alloy-dashboards-1
{{- if .Values.dashboards.metrics.enabled }}
- mountPath: /var/lib/grafana/dashboards/mimir-1
name: mimir-dashboards-1
- mountPath: /var/lib/grafana/dashboards/mimir-2
name: mimir-dashboards-2
- mountPath: /var/lib/grafana/dashboards/mimir-3
name: mimir-dashboards-3
- mountPath: /var/lib/grafana/dashboards/mimir-4
name: mimir-dashboards-4
- mountPath: /var/lib/grafana/dashboards/mimir-5
name: mimir-dashboards-5
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
- mountPath: /var/lib/grafana/dashboards/tempo-1
name: tempo-dashboards-1
{{- end }}
- mountPath: /var/lib/grafana/dashboards/agent-1
name: agent-dashboards-1
volumes:
- name: grafana-pv
persistentVolumeClaim:
@ -83,7 +111,44 @@ spec:
configMap:
name: loki-dashboards-2
{{- end }}
- name: alloy-dashboards-1
{{- if .Values.dashboards.metrics.enabled }}
- name: mimir-dashboards-1
configMap:
name: alloy-dashboards-1
name: mimir-dashboards-1
- name: mimir-dashboards-2
configMap:
name: mimir-dashboards-2
- name: mimir-dashboards-3
configMap:
name: mimir-dashboards-3
- name: mimir-dashboards-4
configMap:
name: mimir-dashboards-4
- name: mimir-dashboards-5
configMap:
name: mimir-dashboards-5
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
- name: tempo-dashboards-1
configMap:
name: tempo-dashboards-1
{{- end }}
- name: agent-dashboards-1
configMap:
name: agent-dashboards-1
---
apiVersion: v1
kind: Service
metadata:
name: grafana
spec:
ports:
- port: 3000
protocol: TCP
targetPort: http-grafana
selector:
app: grafana
sessionAffinity: None
type: ClusterIP # Make this configurable
{{- end }}

View File

@ -12,6 +12,8 @@ data:
{{ $.Files.Get "src/dashboards/loki-deletion.json" | fromJson | toJson }}
"loki-logs.json": |
{{ $.Files.Get "src/dashboards/loki-logs.json" | fromJson | toJson }}
"loki-mixin-recording-rules.json": |
{{ $.Files.Get "src/dashboards/loki-mixin-recording-rules.json" | fromJson | toJson }}
"loki-operational.json": |
{{ $.Files.Get "src/dashboards/loki-operational.json" | fromJson | toJson }}
{{- end }}

View File

@ -0,0 +1,19 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mimir-dashboards-1
namespace: {{ $.Release.Namespace }}
data:
"mimir-alertmanager-resources.json": |
{{ $.Files.Get "src/dashboards/mimir-alertmanager-resources.json" | fromJson | toJson }}
"mimir-alertmanager.json": |
{{ $.Files.Get "src/dashboards/mimir-alertmanager.json" | fromJson | toJson }}
"mimir-compactor-resources.json": |
{{ $.Files.Get "src/dashboards/mimir-compactor-resources.json" | fromJson | toJson }}
"mimir-compactor.json": |
{{ $.Files.Get "src/dashboards/mimir-compactor.json" | fromJson | toJson }}
"mimir-config.json": |
{{ $.Files.Get "src/dashboards/mimir-config.json" | fromJson | toJson }}
{{- end }}

View File

@ -0,0 +1,19 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mimir-dashboards-2
namespace: {{ $.Release.Namespace }}
data:
"mimir-object-store.json": |
{{ $.Files.Get "src/dashboards/mimir-object-store.json" | fromJson | toJson }}
"mimir-overrides.json": |
{{ $.Files.Get "src/dashboards/mimir-overrides.json" | fromJson | toJson }}
"mimir-overview-networking.json": |
{{ $.Files.Get "src/dashboards/mimir-overview-networking.json" | fromJson | toJson }}
"mimir-overview-resources.json": |
{{ $.Files.Get "src/dashboards/mimir-overview-resources.json" | fromJson | toJson }}
"mimir-overview.json": |
{{ $.Files.Get "src/dashboards/mimir-overview.json" | fromJson | toJson }}
{{- end }}

View File

@ -0,0 +1,19 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mimir-dashboards-3
namespace: {{ $.Release.Namespace }}
data:
"mimir-queries.json": |
{{ $.Files.Get "src/dashboards/mimir-queries.json" | fromJson | toJson }}
"mimir-reads-networking.json": |
{{ $.Files.Get "src/dashboards/mimir-reads-networking.json" | fromJson | toJson }}
"mimir-reads-resources.json": |
{{ $.Files.Get "src/dashboards/mimir-reads-resources.json" | fromJson | toJson }}
"mimir-reads.json": |
{{ $.Files.Get "src/dashboards/mimir-reads.json" | fromJson | toJson }}
"mimir-remote-ruler-reads-resources.json": |
{{ $.Files.Get "src/dashboards/mimir-remote-ruler-reads-resources.json" | fromJson | toJson }}
{{- end }}

View File

@ -0,0 +1,19 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mimir-dashboards-4
namespace: {{ $.Release.Namespace }}
data:
"mimir-remote-ruler-reads.json": |
{{ $.Files.Get "src/dashboards/mimir-remote-ruler-reads.json" | fromJson | toJson }}
"mimir-rollout-progress.json": |
{{ $.Files.Get "src/dashboards/mimir-rollout-progress.json" | fromJson | toJson }}
"mimir-ruler.json": |
{{ $.Files.Get "src/dashboards/mimir-ruler.json" | fromJson | toJson }}
"mimir-scaling.json": |
{{ $.Files.Get "src/dashboards/mimir-scaling.json" | fromJson | toJson }}
"mimir-slow-queries.json": |
{{ $.Files.Get "src/dashboards/mimir-slow-queries.json" | fromJson | toJson }}
{{- end }}

View File

@ -0,0 +1,19 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.metrics.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mimir-dashboards-5
namespace: {{ $.Release.Namespace }}
data:
"mimir-tenants.json": |
{{ $.Files.Get "src/dashboards/mimir-tenants.json" | fromJson | toJson }}
"mimir-top-tenants.json": |
{{ $.Files.Get "src/dashboards/mimir-top-tenants.json" | fromJson | toJson }}
"mimir-writes-networking.json": |
{{ $.Files.Get "src/dashboards/mimir-writes-networking.json" | fromJson | toJson }}
"mimir-writes-resources.json": |
{{ $.Files.Get "src/dashboards/mimir-writes-resources.json" | fromJson | toJson }}
"mimir-writes.json": |
{{ $.Files.Get "src/dashboards/mimir-writes.json" | fromJson | toJson }}
{{- end }}

View File

@ -0,0 +1,21 @@
{{- if and .Values.local.grafana.enabled .Values.dashboards.traces.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: tempo-dashboards-1
namespace: {{ $.Release.Namespace }}
data:
"tempo-operational.json": |
{{ $.Files.Get "src/dashboards/tempo-operational.json" | fromJson | toJson }}
"tempo-reads.json": |
{{ $.Files.Get "src/dashboards/tempo-reads.json" | fromJson | toJson }}
"tempo-resources.json": |
{{ $.Files.Get "src/dashboards/tempo-resources.json" | fromJson | toJson }}
"tempo-rollout-progress.json": |
{{ $.Files.Get "src/dashboards/tempo-rollout-progress.json" | fromJson | toJson }}
"tempo-tenants.json": |
{{ $.Files.Get "src/dashboards/tempo-tenants.json" | fromJson | toJson }}
"tempo-writes.json": |
{{ $.Files.Get "src/dashboards/tempo-writes.json" | fromJson | toJson }}
{{- end }}

View File

@ -1,5 +1,5 @@
{{- if .Values.local.grafana.enabled }}
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
apiVersion: apps/v1
kind: Deployment
metadata:
@ -49,9 +49,6 @@ spec:
- containerPort: 7946
name: memberlist
protocol: TCP
envFrom:
- secretRef:
name: minio
readinessProbe:
failureThreshold: 3
httpGet:

View File

@ -1,5 +1,5 @@
{{- if .Values.local.metrics.enabled }}
{{- if and .Values.local.grafana.enabled .Values.dashboards.logs.enabled }}
{{- if and .Values.local.grafana.enabled (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled .Values.dashboards.traces.enabled) }}
---
apiVersion: v1
kind: ConfigMap
@ -10,5 +10,11 @@ data:
{{- if .Values.dashboards.logs.enabled }}
{{ ($.Files.Glob "src/rules/loki-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- if .Values.dashboards.metrics.enabled }}
{{ ($.Files.Glob "src/rules/mimir-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- if .Values.dashboards.traces.enabled }}
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- end }}
{{- end }}

View File

@ -34,6 +34,10 @@
{{- end -}}
{{- end -}}
{{- if empty .Values.namespacesToMonitor -}}
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
{{- end -}}
{{- if empty .Values.metrics.retain -}}
{{- fail "All metrics will be collected, please specify some in metrics.retain" -}}
{{- end -}}

View File

@ -1,9 +1,9 @@
# Specify the namespaces to monitor here
# By default the chart will monitor the namespace it is installed in
# namespacesToMonitor:
# - loki
namespacesToMonitor:
- loki
# The name of the cluster where this will be installed
clusterLabelValue: "meta"
clusterLabelValue: "meta-monitoring"
# Set to true to write logs, metrics or traces to Grafana Cloud
# The secrets have to be created first
cloud:
@ -16,6 +16,7 @@ cloud:
traces:
enabled: true
secret: "traces"
# Set to true for a local version of logs, metrics or traces
local:
grafana:
@ -28,8 +29,8 @@ local:
enabled: false
minio:
enabled: false # This should be set to true if any of the previous is enabled
grafana:
version: 11.4.3
# Gateway ingress configuration
ingress:
# -- Specifies whether an ingress for the gateway should be created
@ -46,83 +47,42 @@ grafana:
paths:
- path: /
# -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers
pathType: Prefix
# backend:
# service:
# name: TODO
# port:
# number: TODO
# pathType: Prefix
# -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating
#tls:
# - secretName: grafana-tls
# hosts:
# - monitoring.example.com
logs:
# Adding regexes here will add a stage.replace block for logs. For more information see
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
piiRegexes: null # This example replaces the word after password with *****
piiRegexes:
# This example replaces the word after password with *****
# - expression: "password (\\\\S+)"
# source: "" # Empty uses the log message
# replace: "*****""
# The lines matching these will be kept in Loki
retain:
# This shows the queries
- executing query
- caller=metrics.go
# This shows any errors
- level=error
- level=warn
# This shows the ingest requests and is very noisy. Uncomment to include.
# - caller=push.go
# Log lines for delete requests
- delete request for user added
- Started processing delete request
- delete request for user marked as processed
# This shows the ingest requests and is very noisy. Uncomment to include.
# - caller=push.go
# Additional log lines to retain
extraLogs: []
metrics:
# The list of metrics to retain for logging dashboards
retain:
- alloy_build_info
- alloy_config_last_load_success_timestamp_seconds
- alloy_config_last_load_successful
- alloy_config_load_failures_total
- alloy_component_controller_evaluating
- alloy_component_dependencies_wait_seconds
- alloy_component_dependencies_wait_seconds_bucket
- alloy_component_evaluation_seconds
- alloy_component_evaluation_seconds_bucket
- alloy_component_evaluation_seconds_count
- alloy_component_evaluation_seconds_sum
- alloy_component_evaluation_slow_seconds
- alloy_component_controller_running_components
- alloy_resources_machine_rx_bytes_total
- alloy_resources_machine_tx_bytes_total
- alloy_resources_process_cpu_seconds_total
- alloy_resources_process_resident_memory_bytes
- prometheus_remote_write_wal_samples_appended_total
- prometheus_remote_write_wal_storage_active_series
- cluster_node_info
- cluster_node_lamport_time
- cluster_node_update_observers
- cluster_node_gossip_health_score
- cluster_node_gossip_proto_version
- cluster_node_gossip_received_events_total
- cluster_node_peers
- cluster_transport_rx_bytes_total
- cluster_transport_rx_packets_total
- cluster_transport_rx_packets_failed_total
- cluster_transport_stream_rx_bytes_total
- cluster_transport_stream_rx_packets_failed_total
- cluster_transport_stream_rx_packets_total
- cluster_transport_stream_tx_bytes_total
- cluster_transport_stream_tx_packets_total
- cluster_transport_stream_tx_packets_failed_total
- cluster_transport_streams
- cluster_transport_tx_packets_total
- cluster_transport_tx_packets_failed_total
- cluster_transport_rx_packet_queue_length
- cluster_transport_tx_packet_queue_length
- agent_config_last_load_success_timestamp_seconds
- agent_config_last_load_successful
- agent_config_load_failures_total
- container_cpu_usage_seconds_total
- container_fs_writes_bytes_total
- container_memory_working_set_bytes
@ -138,10 +98,7 @@ metrics:
- cortex_prometheus_rule_group_last_duration_seconds
- cortex_prometheus_rule_group_last_evaluation_timestamp_seconds
- cortex_prometheus_rule_group_iterations_missed_total
- exporter_send_failed_spans_ratio_total
- exporter_sent_spans_ratio_total
- go_gc_duration_seconds
- go_gc_duration_seconds_count
- go_goroutines
- go_memstats_heap_inuse_bytes
- kubelet_volume_stats_used_bytes
@ -152,7 +109,6 @@ metrics:
- kube_pod_container_resource_requests
- kube_pod_container_status_last_terminated_reason
- kube_pod_container_status_restarts_total
- loki_azure_blob_request_duration_seconds_bucket
- loki_boltdb_shipper_compact_tables_operation_duration_seconds
- loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds
- loki_boltdb_shipper_retention_marker_count_total
@ -178,15 +134,10 @@ metrics:
- loki_compactor_deleted_lines
- loki_compactor_oldest_pending_delete_request_age_seconds
- loki_compactor_pending_delete_requests_count
- loki_consul_request_duration_seconds_bucket
- loki_discarded_samples_total
- loki_discarded_bytes_total
- loki_distributor_bytes_received_total
- loki_distributor_lines_received_total
- loki_distributor_structured_metadata_bytes_received_total
- loki_gcs_request_duration_seconds_bucket
- loki_gcs_request_duration_seconds_count
- loki_index_request_duration_seconds_bucket
- loki_index_request_duration_seconds_count
- loki_ingester_chunk_age_seconds_bucket
- loki_ingester_chunk_age_seconds_count
@ -199,7 +150,6 @@ metrics:
- loki_ingester_chunk_entries_sum
- loki_ingester_chunk_size_bytes_bucket
- loki_ingester_chunk_utilization_bucket
- loki_ingester_chunk_utilization_count
- loki_ingester_chunk_utilization_sum
- loki_ingester_chunks_flushed_total
- loki_ingester_flush_queue_length
@ -217,8 +167,6 @@ metrics:
- loki_ruler_wal_prometheus_remote_storage_samples_total
- loki_ruler_wal_samples_appended_total
- loki_ruler_wal_storage_created_series_total
- loki_s3_request_duration_seconds_bucket
- loki_s3_request_duration_seconds_count
- loki_write_batch_retries_total
- loki_write_dropped_bytes_total
- loki_write_dropped_entries_total
@ -226,64 +174,22 @@ metrics:
- loki_write_sent_entries_total
- node_disk_read_bytes_total
- node_disk_written_bytes_total
- process_start_time_seconds
- processor_batch_batch_send_size_ratio_bucket
- processor_batch_metadata_cardinality_ratio
- processor_batch_timeout_trigger_send_ratio_total
- prometheus_remote_storage_bytes_total
- prometheus_remote_storage_enqueue_retries_total
- prometheus_remote_storage_highest_timestamp_in_seconds
- prometheus_remote_storage_metadata_bytes_total
- prometheus_remote_storage_queue_highest_sent_timestamp_seconds
- prometheus_remote_storage_samples_dropped_total
- prometheus_remote_storage_samples_failed_total
- prometheus_remote_storage_samples_pending
- prometheus_remote_storage_samples_retried_total
- prometheus_remote_storage_samples_total
- prometheus_remote_storage_sent_batch_duration_seconds_bucket
- prometheus_remote_storage_sent_batch_duration_seconds_count
- prometheus_remote_storage_sent_batch_duration_seconds_sum
- prometheus_remote_storage_shard_capacity
- prometheus_remote_storage_shards
- prometheus_remote_storage_shards_desired
- prometheus_remote_storage_shards_max
- prometheus_remote_storage_shards_min
- prometheus_remote_storage_succeeded_samples_total
- prometheus_remote_write_wal_samples_appended_total
- prometheus_remote_write_wal_storage_active_series
- prometheus_sd_discovered_targets
- prometheus_target_interval_length_seconds_count
- prometheus_target_interval_length_seconds_sum
- prometheus_target_scrapes_exceeded_sample_limit_total
- prometheus_target_scrapes_sample_duplicate_timestamp_total
- prometheus_target_scrapes_sample_out_of_bounds_total
- prometheus_target_scrapes_sample_out_of_order_total
- prometheus_target_sync_length_seconds_sum
- prometheus_wal_watcher_current_segment
- promtail_custom_bad_words_total
- promtail_dropped_bytes_total
- promtail_files_active_total
- promtail_read_bytes_total
- promtail_read_lines_total
- promtail_request_duration_seconds_bucket
- promtail_sent_entries_total
- rpc_server_duration_milliseconds_bucket
- receiver_accepted_spans_ratio_total
- receiver_refused_spans_ratio_total
- scrape_duration_seconds
- traces_exporter_sent_spans
- traces_exporter_send_failed_spans
- traces_loadbalancer_backend_outcome
- traces_loadbalancer_num_backends
- traces_receiver_accepted_spans
- traces_receiver_refused_spans
- up
# Additional metrics to retain
extraMetrics: []
# Set enabled = true to add the default logs dashboards to the local Grafana
# Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana
dashboards:
logs:
enabled: true
metrics:
enabled: true
traces:
enabled: true
global:
minio:
rootUser: "rootuser"
rootPassword: "rootpassword"
kubeStateMetrics:
# Scrape https://github.com/kubernetes/kube-state-metrics by default
enabled: true
@ -291,8 +197,10 @@ kubeStateMetrics:
# https://artifacthub.io/packages/helm/prometheus-community/kube-state-metrics/
# is used. Change this if kube-state-metrics is installed somewhere else.
endpoint: kube-state-metrics.kube-state-metrics.svc.cluster.local:8080
# The following are configuration for the dependencies.
# These should usually not be changed.
loki:
loki:
auth_enabled: false
@ -317,9 +225,9 @@ loki:
common:
storage:
s3:
access_key_id: "${rootUser}"
access_key_id: "{{ .Values.global.minio.rootUser }}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "${rootPassword}"
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
compactor:
retention_enabled: true
delete_request_store: s3
@ -342,24 +250,9 @@ loki:
installOperator: false
lokiCanary:
enabled: false
write:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
read:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
backend:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
test:
enabled: false
alloy:
alloy:
clustering:
@ -391,36 +284,37 @@ alloy:
maxReplicas: 30
targetMemoryUtilizationPercentage: 90
targetCPUUtilizationPercentage: 90
mimir-distributed:
minio:
enabled: false
global:
extraEnvFrom:
- secretRef:
name: "minio"
mimir:
structuredConfig:
alertmanager_storage:
s3:
bucket_name: mimir-ruler
access_key_id: "{{ .Values.global.minio.rootUser }}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
insecure: true
blocks_storage:
backend: s3
s3:
bucket_name: mimir-tsdb
access_key_id: "{{ .Values.global.minio.rootUser }}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
insecure: true
ruler_storage:
s3:
bucket_name: mimir-ruler
common:
storage:
backend: s3
s3:
bucket_name: mimir-ruler
access_key_id: "${rootUser}"
access_key_id: "{{ .Values.global.minio.rootUser }}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "${rootPassword}"
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
insecure: true
limits:
compactor_blocks_retention_period: 30d
tempo-distributed:
tempo:
structuredConfig:
@ -430,47 +324,22 @@ tempo-distributed:
s3:
bucket: tempo
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
access_key: "${rootUser}"
secret_key: "${rootPassword}"
access_key: "{{ .Values.global.minio.rootUser }}"
secret_key: "{{ .Values.global.minio.rootPassword }}"
insecure: true
distributor:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
ingester:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
compactor:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
querier:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
queryFrontend:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
compaction:
block_retention: 30d
traces:
otlp:
http:
enabled: true
grpc:
enabled: true
minio:
existingSecret: "minio"
rootUser: rootuser
rootPassword: rootpassword
buckets:
- name: loki-chunks
policy: none

View File

@ -1,12 +1,8 @@
# Update the dependencies
The dependencies are the versions of Loki, Mimir, Agent and so on that are included in this chart.
The dependencies are the version of Loki, Mimir, Agent and so on that are included in this chart.
The current versions can be found in the [Chart.yaml](../charts/meta-monitoring/Chart.yaml) file.
A Github action runs daily to see if updated versions are available. A PR will be created.
The manual steps are as follows:
Run this in the charts/meta-monitoring directory after updating a dependency:
```

View File

@ -4,7 +4,7 @@
1. Use an existing Grafana Cloud account or setup a new one. Then create an access token:
1. In a Grafana instance on Grafana Cloud go to Administration -> Users and Access -> Cloud access policies.
1. In Grafana go to Administration -> Users and Access -> Cloud access policies.
1. Click `Create access policy`.
@ -25,21 +25,21 @@
```
kubectl create secret generic logs -n meta \
--from-literal=username=<logs username> \
--from-literal=password=<token> \
--from-literal=password=<token>
--from-literal=endpoint='https://logs-prod-us-central1.grafana.net/loki/api/v1/push'
kubectl create secret generic metrics -n meta \
--from-literal=username=<metrics username> \
--from-literal=password=<token> \
--from-literal=password=<token>
--from-literal=endpoint='https://prometheus-us-central1.grafana.net/api/prom/push'
kubectl create secret generic traces -n meta \
--from-literal=username=<OTLP instance ID> \
--from-literal=password=<token> \
--from-literal=password=<token>
--from-literal=endpoint='https://otlp-gateway-prod-us-east-0.grafana.net/otlp'
```
The logs, metrics and traces usernames are the `User / Username / Instance IDs` of the Loki, Prometheus/Mimir and OpenTelemetry instances in Grafana Cloud. From `Home` in Grafana click on `Stacks`. Then go to the `Details` pages of Loki and Prometheus/Mimir. For OpenTelemetry go to the `Configure` page. The endpoints will also have to be changed to match your settings.
The logs, metrics and traces usernames are the `User / Username / Instance IDs` of the Loki, Prometheus/Mimir and OpenTelemetry instances in Grafana Cloud. From `Home` in Grafana click on `Stacks`. Then go to the `Details` pages of Loki and Prometheus/Mimir. For OpenTelemetry go to the `Configure` page.
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). Fill in the names of the secrets created above as needed. An example minimal values.yaml looks like this:
@ -67,14 +67,6 @@
kubectl create namespace meta
```
1. Create a secret named `minio` with the user and password for the local Minio:
```
kubectl create secret generic minio -n meta \
--from-literal=rootPassword=<password> \
--from-literal=rootUser=<user>
```
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). An example minimal values.yaml looks like this:
```
@ -102,7 +94,7 @@
enabled: true
```
## Installing, updating and deleting the chart
## Installing the chart
1. Add the repo
@ -175,7 +167,7 @@ For each of the dashboard files in charts/meta-monitoring/src/dashboards folder
## Configure Loki to send traces
1. In the Loki that is being monitored enable tracing in the config:
1. In the Loki config enable tracing:
```
loki:
@ -187,15 +179,7 @@ For each of the dashboard files in charts/meta-monitoring/src/dashboards folder
1. JAEGER_ENDPOINT: http address of the mmc-alloy service installed by the meta-monitoring chart, for example "http://mmc-alloy:14268/api/traces"
1. JAEGER_AGENT_TAGS: extra tags you would like to add to the spans, for example 'cluster="abc",namespace="def"'
1. JAEGER_SAMPLER_TYPE: the sampling strategy, we suggest setting this to `ratelimiting` so at most 1 trace is accepted per second. See these [docs](https://www.jaegertracing.io/docs/1.57/sampling/) for more options.
1. JAEGER_SAMPLER_PARAM: 1.0
1. JAEGER_SAMPLER_TYPE: the sampling strategy, for example to sample all use 'const' with a value of 1 for the next environment variable
1. JAEGER_SAMPLER_PARAM: 1
1. If Loki is installed in a different namespace you can create an [ExternalName service](https://kubernetes.io/docs/concepts/services-networking/service/#externalname) in Kubernetes to point to the mmc-alloy service in the meta monitoring namespace
## Configure external access using an Ingress in local mode
When using local mode by default a Kubernetes [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) object is created to access the Grafana instance. This will need to be adapted to your cloud provider by updating the `grafana.ingress` section of the `values.yaml` file provided to Helm. Check the documentation of your cloud provider for available options.
## Kube-state-metrics
Metrics about Kubernetes objects are scraped from [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics). This needs to be installed in the cluster. The `kubeStateMetrics.endpoint` entry in values.yaml should be set to it's address (without the `/metrics` part in the URL).