Compare commits

...

36 Commits

Author SHA1 Message Date
Michel Hollands
76908c1e9e Turn on cloud metrics and traces by default
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-10-19 10:47:12 +01:00
Michel Hollands
bc5cdadb9f Rename file and do not run ruler when no mimir
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-10-19 10:28:59 +01:00
Michel Hollands
687c77c0f6 Use cloud
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-10-19 09:31:59 +01:00
Michel Hollands
f4934d6007 Remove space
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-10-18 11:35:54 +01:00
Michel Hollands
1093e91741 Change namespace name
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:45:57 +01:00
Michel Hollands
1ed196299b Increase timeout
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:44:31 +01:00
Michel Hollands
faa0015c11 Install locally by default
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:28:47 +01:00
Michel Hollands
53416e042c Use correct namespace
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:26:35 +01:00
Michel Hollands
d804da13f1 Add test install
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:20:20 +01:00
Michel Hollands
8c0b68fe02 Fix kind.config
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:13:16 +01:00
Michel Hollands
99bb8f13c2 Apply linting 5
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:10:36 +01:00
Michel Hollands
26ff679cbb Apply linting 4
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:06:52 +01:00
Michel Hollands
fb3e3ece1b Apply linting 3
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:05:21 +01:00
Michel Hollands
7a5358b322 Apply linting 2
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:03:29 +01:00
Michel Hollands
9c92e18efe Apply linting
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 11:02:07 +01:00
Michel Hollands
ffe220590d Update dependencies
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 10:58:54 +01:00
Michel Hollands
e3708ce3fe Add ct.yaml
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 10:44:43 +01:00
Michel Hollands
3149f4df9b Add install step
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-18 10:22:14 +01:00
Michel Hollands
86ec586917 Fix typo
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-17 10:31:56 +01:00
Michel Hollands
6cd12bee01 Add linted rule files
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-17 10:29:24 +01:00
Michel Hollands
b042b396a2 Temp checkin
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-17 10:27:42 +01:00
Michel Hollands
bcacb70e2d Merge pull request #5 from grafana/add_skip_cdrs_to_installation_step
Update readme
2023-08-16 11:07:05 +01:00
Michel Hollands
d9c3b60659 Update documentation
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 11:02:16 +01:00
Michel Hollands
6d091d564e Add note about CRDs
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-16 09:47:54 +01:00
Michel Hollands
8671993962 Update readme
Signed-off-by: Michel Hollands <michel.hollands@grafana.com>
2023-08-16 09:44:36 +01:00
Michel Hollands
f80c9d7c43 Merge pull request #15 from grafana/add_retention
Add retention for Loki, Mimir and Tempo
2023-08-15 16:54:41 +01:00
Michel Hollands
60853bc8b0 Merge pull request #12 from grafana/add_agent_dashboards
Add agent dashboards
2023-08-15 16:53:39 +01:00
Michel Hollands
debdd67283 Merge pull request #13 from grafana/fix_mimir_dashboards
Fix mimir dashboards
2023-08-15 16:53:15 +01:00
Michel Hollands
8bc465b2e6 Merge pull request #14 from grafana/fix_tempo_dashboards
Fix Tempo dashboards
2023-08-15 16:53:06 +01:00
Michel Hollands
18d24c39f7 Add Loki retention
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-15 15:21:24 +01:00
Michel Hollands
23d14110a0 Add 1 day retention to Tempo and Mimir
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-15 10:47:33 +01:00
Michel Hollands
092423c2b3 Fix
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-14 15:22:40 +01:00
Michel Hollands
dcbe85a37a Fix
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-14 15:18:42 +01:00
Michel Hollands
db8558982c Also for
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-14 15:16:38 +01:00
Michel Hollands
49034b9f6b Fix dashboards
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-14 15:05:29 +01:00
Michel Hollands
aa988adb47 Add agent dashboards
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2023-08-03 15:28:47 +01:00
29 changed files with 6485 additions and 665 deletions

66
.github/workflows/helm-ci.yml vendored Normal file
View File

@@ -0,0 +1,66 @@
---
# CI for the meta-monitoring Helm chart: lints the chart sources and, when the
# chart changed relative to the target branch, installs it into a kind cluster.
name: helm-ci

on:
  pull_request:
    paths:
      - "charts/meta-monitoring/**"

env:
  # chart-testing configuration shared by every `ct` invocation below.
  CT_CONFIGFILE: charts/meta-monitoring/ct.yaml

jobs:
  call-lint:
    name: Lint Helm Chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3

      - name: Lint Yaml
        run: make helm-lint

  call-test:
    name: Test Helm Chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          # Full history so `ct list-changed` can diff against the target branch.
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.8.2

      # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and
      # yamllint (https://github.com/adrienverge/yamllint) which require Python
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string, never as a float.
          python-version: "3.7"

      - name: Set up chart-testing
        uses: helm/chart-testing-action@v2.4.0

      # Publishes the step output `changed=true` when at least one chart changed;
      # the cluster creation and install steps below are gated on it.
      - name: Run chart-testing (list-changed)
        id: list-changed
        run: |
          changed=$(ct list-changed --config "${CT_CONFIGFILE}")
          if [[ -n "$changed" ]]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Run chart-testing (lint)
        run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false

      - name: Create kind cluster
        uses: helm/kind-action@v1.8.0
        if: steps.list-changed.outputs.changed == 'true'
        with:
          config: tools/kind.config

      # Gated exactly like the cluster creation above: without this condition
      # `ct install` would run even when no kind cluster had been created.
      - name: Run chart-testing (install)
        if: steps.list-changed.outputs.changed == 'true'
        run: ct install --config "${CT_CONFIGFILE}"

10
Makefile Normal file
View File

@@ -0,0 +1,10 @@
# Self-documenting Makefile, adapted from https://www.thapaliya.com/en/writings/well-documented-makefiles/
# The awk program in `help` prints every target line that carries a trailing
# double-hash annotation, together with that annotation text.
.PHONY: help helm-lint

help: ## Display this help and any documented user-facing targets. Other undocumented targets may be present in the Makefile.
	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make <target>\n\nTargets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " %-45s %s\n", $$1, $$2 }' $(MAKEFILE_LIST)

# Delegates to the `lint` target of the Makefile in charts/meta-monitoring
# (-C changes into that directory, -B forces the target to be remade).
helm-lint: ## Run helm linter
	$(MAKE) -BC charts/meta-monitoring lint

View File

@@ -3,6 +3,8 @@
This is a meta-monitoring chart for GEL, GEM and GET. It should be installed in a
separate namespace next to GEM, GEL or GET installations.
Note that this is pre-production software at the moment.
## Preparation
Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml).
@@ -15,29 +17,54 @@ Create a values.yaml file based on the [default one](../charts/meta-monitoring/v
## Local and cloud modes
The chart has 2 modes: local and cloud. In the local mode logs, metrics and traces are sent
The chart has 2 modes: local and cloud. In the local mode logs, metrics and/or traces are sent
to small Loki, Mimir and Tempo installations running in the meta-monitoring namespace.
![local mode](docs/images/Meta%20monitoring%20local.png)
To enable local mode set `local.enabled` to true.
To enable local mode set `local.<logs|metrics|traces>.enabled` to true.
In the cloud mode the logs, metrics and traces are sent to
In the cloud mode the logs, metrics and/or traces are sent to Grafana Cloud.
![cloud mode](docs/images/Meta%20monitoring%20cloud.png)
To enable cloud mode set `cloud.enabled` to true. The `endpoint`, `username` and `password` settings for your Grafana Cloud logs, metrics and traces instances have to be filled in as well.
To enable cloud mode set `cloud.<logs|metrics|traces>.enabled` to true. The `endpoint`, `username` and `password` settings for your Grafana Cloud logs, metrics and traces instances have to be filled in as well.
Both modes can be enabled at the same time.
## Installation
```
helm install -n meta -f values.yaml meta ./charts/meta-monitoring
helm install -n meta --skip-crds -f values.yaml meta ./charts/meta-monitoring
```
If the platform supports CRDs the `--skip-crds` option can be removed. However, the CRDs are not used by this chart.
For more instructions including how to update the chart go to the [installation](docs/installation.md) page.
## Supported features
- Specify which namespaces are monitored
- Specify if logs, metrics or traces should be enabled for cloud or local
- Specify the cluster name used for the logs, metrics and traces
- Specify PII regexes that are applied to logs before they are sent to Loki (cloud or local). The capture group in the regex is replaced with *****.
- A Grafana instance is installed (when local mode is used) with the relevant datasources installed. The following dashboards are installed:
- logs dashboards
- metrics dashboards
- traces dashboards
- agent dashboards
- Retention is set to 24 hours
Most of these features are enabled by default. See the values.yaml file for how to enable/disable them.
## Caveats
- The [loki.source.kubernetes](https://grafana.com/docs/agent/latest/flow/reference/components/loki.source.kubernetes/) component of the Grafana Agent is used to scrape Kubernetes log files. This component is marked experimental at the moment.
- This has not been tested on OpenShift yet.
- The underlying Loki, Mimir and Tempo are at the default size installed by the Helm chart. This might need changing when monitoring bigger Loki, Mimir or Tempo installations.
- MinIO is used as storage at the moment with a limited retention. At the moment this chart cannot be used for monitoring over longer periods.
- Agent self monitoring is not done at the moment.
## Developer help topics
- [update dependencies](docs/dev_update_dependencies.md)
- [update dependencies](docs/dev_update_dependencies.md)

View File

@@ -14,5 +14,5 @@ dependencies:
- name: minio
repository: https://charts.min.io
version: 5.0.11
digest: sha256:4b04084e6fe821c4d481017b2430f7c8cd782a5d60830dd3a24eb8f10a9ece09
generated: "2023-06-29T14:25:07.247853+01:00"
digest: sha256:da0e744b5046eb7972e0bf82d1d0ba4786e9600af63b65f35b16118105248074
generated: "2023-08-18T10:58:08.978123+01:00"

View File

@@ -25,21 +25,21 @@ appVersion: "0.0.1"
dependencies:
- name: loki
repository: https://grafana.github.io/helm-charts
repository: https://grafana.github.io/helm-charts
version: "5.8.0"
condition: local.logs.enabled
- name: grafana-agent
repository: https://grafana.github.io/helm-charts
repository: https://grafana.github.io/helm-charts
version: "0.15.0"
- name: mimir-distributed
repository: https://grafana.github.io/helm-charts
repository: https://grafana.github.io/helm-charts
version: "4.4.1"
condition: local.metrics.enabled
- name: tempo-distributed
repository: https://grafana.github.io/helm-charts
repository: https://grafana.github.io/helm-charts
version: "1.4.7"
condition: local.traces.enabled
- name: minio
repository: https://charts.min.io
version: "5.0.11"
condition: local.minio.enabled
condition: local.minio.enabled

View File

@@ -0,0 +1,7 @@
# Running plain `make` in this directory executes the `lint` target.
.DEFAULT_GOAL := lint

# Umbrella target: currently its only prerequisite is the YAML lint.
.PHONY: lint
lint: lint-yaml

# Runs yamllint over the src directory using the config file stored inside it.
.PHONY: lint-yaml
lint-yaml:
	yamllint -c $(CURDIR)/src/.yamllint.yaml $(CURDIR)/src

View File

@@ -0,0 +1,11 @@
---
# chart-testing (`ct`) configuration.

# Git remote and branch that changed charts are compared against.
remote: origin
target-branch: main

# Directories that contain charts.
chart-dirs:
  - charts

# Helm repositories (name=url) made available for chart dependencies.
chart-repos:
  - grafana=https://grafana.github.io/helm-charts
  - minio=https://charts.min.io

# Extra arguments handed to Helm; a long timeout is set here.
helm-extra-args: "--timeout 1200s"

# Checks that are switched off.
check-version-increment: false
validate-maintainers: false

View File

@@ -0,0 +1,4 @@
---
# yamllint configuration: only the quoted-strings rule is set here, and it
# requires string values to be quoted.
rules:
  quoted-strings:
    required: true

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,786 @@
{
"annotations": {
"list": [ ]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"links": [ ],
"refresh": "30s",
"rows": [
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "Count",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #A",
"thresholds": [ ],
"type": "hidden",
"unit": "short"
},
{
"alias": "Uptime",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "Value #B",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "Container",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "container",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "Pod",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "pod",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "Version",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"link": false,
"linkTargetBlank": false,
"linkTooltip": "Drill down",
"linkUrl": "",
"pattern": "version",
"thresholds": [ ],
"type": "number",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [ ],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"pattern": "/.*/",
"thresholds": [ ],
"type": "string",
"unit": "short"
}
],
"targets": [
{
"expr": "count by (pod, container, version) (agent_build_info{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A",
"step": 10
},
{
"expr": "max by (pod, container) (time() - process_start_time_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
"format": "table",
"instant": true,
"intervalFactor": 2,
"legendFormat": "",
"refId": "B",
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Agent Stats",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"transform": "table",
"type": "table",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Agent Stats",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])) by (pod, scrape_job) * 1e3",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}/{{scrape_job}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Target Sync",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (pod) (prometheus_sd_discovered_targets{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Targets",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Prometheus Discovery",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(prometheus_target_interval_length_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])\n/\nrate(prometheus_target_interval_length_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m])\n* 1e3\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}} {{interval}} configured",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Average Scrape Interval Duration",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 5,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "exceeded sample limit: {{job}}",
"legendLink": null,
"step": 10
},
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "duplicate timestamp: {{job}}",
"legendLink": null,
"step": 10
},
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out of bounds: {{job}}",
"legendLink": null,
"step": 10
},
{
"expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[1m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out of order: {{job}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Scrape failures",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, instance_group_name) (rate(agent_wal_samples_appended_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"$container\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{job}} {{instance_group_name}}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Appended Samples",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Prometheus Retrieval",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"grafana-agent-mixin"
],
"templating": {
"list": [
{
"current": {
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "cluster",
"multi": true,
"name": "cluster",
"options": [ ],
"query": "label_values(agent_build_info, cluster)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "namespace",
"multi": true,
"name": "namespace",
"options": [ ],
"query": "label_values(agent_build_info, namespace)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "container",
"multi": true,
"name": "container",
"options": [ ],
"query": "label_values(agent_build_info, container)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": "grafana-agent-.*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "pod",
"multi": true,
"name": "pod",
"options": [ ],
"query": "label_values(agent_build_info{container=~\"$container\"}, pod)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Agent",
"uid": "",
"version": 0
}

View File

@@ -161,7 +161,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "rate(go_gc_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])",
"expr": "rate(go_gc_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])",
"interval": "",
"legendFormat": "{{pod}}",
"refId": "A"
@@ -256,7 +256,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}",
"expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}",
"interval": "",
"legendFormat": "{{pod}}",
"refId": "A"
@@ -351,7 +351,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}",
"expr": "go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}",
"legendFormat": "{{pod}}",
"refId": "A"
}
@@ -441,7 +441,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$component.*\", container!=\"POD\"}[$__rate_interval])",
"expr": "rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\".*$component.*\", container!=\"POD\"}[$__rate_interval])",
"interval": "",
"intervalFactor": 5,
"legendFormat": "{{pod}}",
@@ -537,7 +537,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$component.*\", container!=\"POD\"}",
"expr": "container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\".*$component.*\", container!=\"POD\"}",
"interval": "",
"legendFormat": "{{pod}}",
"refId": "A"
@@ -632,14 +632,14 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$component.*\"}[$__rate_interval])",
"expr": "rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\".*$component.*\"}[$__rate_interval])",
"hide": false,
"interval": "",
"legendFormat": "rx-{{pod}}",
"refId": "A"
},
{
"expr": "rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$component.*\"}[$__rate_interval])",
"expr": "rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\".*$component.*\"}[$__rate_interval])",
"hide": false,
"interval": "",
"legendFormat": "tx-{{pod}}",
@@ -735,7 +735,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "kubelet_volume_stats_available_bytes{cluster=\"$cluster\", namespace=\"$namespace\", persistentvolumeclaim=~\"$component.*\"}",
"expr": "kubelet_volume_stats_available_bytes{cluster=\"$cluster\", namespace=\"$namespace\", persistentvolumeclaim=~\".*$component.*\"}",
"legendFormat": "{{persistentvolumeclaim}}",
"refId": "A"
}
@@ -829,7 +829,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", app=~\"$component.*\"}[$__rate_interval])",
"expr": "rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", app=~\".*$component.*\"}[$__rate_interval])",
"interval": "",
"legendFormat": "{{exported_pod}}",
"refId": "A"
@@ -934,7 +934,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "tempodb_work_queue_length{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"} / tempodb_work_queue_max{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}",
"expr": "tempodb_work_queue_length{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"} / tempodb_work_queue_max{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}",
"legendFormat": "{{instance}}",
"refId": "A"
}
@@ -1024,17 +1024,17 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(increase(tempodb_compaction_errors_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (job)",
"expr": "sum(increase(tempodb_compaction_errors_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (job)",
"legendFormat": "compaction_err",
"refId": "B"
},
{
"expr": "sum(increase(tempodb_retention_errors_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (job)",
"expr": "sum(increase(tempodb_retention_errors_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (job)",
"legendFormat": "retention_err",
"refId": "C"
},
{
"expr": "sum(increase(tempodb_blocklist_poll_errors_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (job)",
"expr": "sum(increase(tempodb_blocklist_poll_errors_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (job)",
"legendFormat": "blocklist_err",
"refId": "D"
}
@@ -1124,18 +1124,18 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempodb_blocklist_poll_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempodb_blocklist_poll_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempodb_blocklist_poll_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempodb_blocklist_poll_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (le))",
"legendFormat": ".9",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempodb_blocklist_poll_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempodb_blocklist_poll_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C"
@@ -1227,7 +1227,7 @@
"targets": [
{
"exemplar": true,
"expr": "avg(tempodb_blocklist_length{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}) by (tenant)",
"expr": "avg(tempodb_blocklist_length{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}) by (tenant)",
"instant": false,
"interval": "",
"legendFormat": "{{tenant}}",
@@ -1319,19 +1319,19 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempodb_retention_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/compactor\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempodb_retention_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempodb_retention_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/compactor\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempodb_retention_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempodb_retention_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/compactor\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempodb_retention_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C"
@@ -1422,13 +1422,13 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(increase(tempodb_retention_deleted_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval]))",
"expr": "sum(increase(tempodb_retention_deleted_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval]))",
"interval": "",
"legendFormat": "deleted",
"refId": "A"
},
{
"expr": "sum(increase(tempodb_retention_marked_for_deletion_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval]))",
"expr": "sum(increase(tempodb_retention_marked_for_deletion_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval]))",
"interval": "",
"legendFormat": "marked_for_deletion",
"refId": "B"
@@ -2049,7 +2049,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", route=~\".*api_traces_traceid\", job=\"$namespace/query-frontend\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", route=~\".*api_traces_traceid\", job=\"$namespace/.*query-frontend\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -2145,19 +2145,19 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_traces_traceid\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_traces_traceid\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_traces_traceid\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_traces_traceid\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_traces_traceid\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_traces_traceid\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C"
@@ -2253,7 +2253,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(tempo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", route=~\".*api_search.*\", job=\"$namespace/query-frontend\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", route=~\".*api_search.*\", job=\"$namespace/.*query-frontend\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -2351,7 +2351,7 @@
"targets": [
{
"exemplar": true,
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A",
@@ -2359,7 +2359,7 @@
},
{
"exemplar": true,
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B",
@@ -2367,7 +2367,7 @@
},
{
"exemplar": true,
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C",
@@ -2463,7 +2463,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"querier_.*api_traces_traceid\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"querier_.*api_traces_traceid\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -2559,19 +2559,19 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_traces_traceid\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_traces_traceid\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_traces_traceid\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_traces_traceid\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_traces_traceid\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_traces_traceid\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C"
@@ -2667,7 +2667,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"querier_.*api_search.*\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"querier_.*api_search.*\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -2765,7 +2765,7 @@
"targets": [
{
"exemplar": true,
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A",
@@ -2773,7 +2773,7 @@
},
{
"exemplar": true,
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B",
@@ -2781,7 +2781,7 @@
},
{
"exemplar": true,
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C",
@@ -2877,7 +2877,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempo_request_duration_seconds_count{route=\"/tempopb.Querier/FindTraceByID\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=\"/tempopb.Querier/FindTraceByID\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -2973,19 +2973,19 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=\"/tempopb.Querier/FindTraceByID\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=\"/tempopb.Querier/FindTraceByID\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=\"/tempopb.Querier/FindTraceByID\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=\"/tempopb.Querier/FindTraceByID\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=\"/tempopb.Querier/FindTraceByID\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=\"/tempopb.Querier/FindTraceByID\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C"
@@ -3081,7 +3081,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"/tempopb.Querier/Search.*\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"/tempopb.Querier/Search.*\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -3179,7 +3179,7 @@
"targets": [
{
"exemplar": true,
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/tempopb.Querier/Search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=~\"/tempopb.Querier/Search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A",
@@ -3187,7 +3187,7 @@
},
{
"exemplar": true,
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/tempopb.Querier/Search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=~\"/tempopb.Querier/Search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B",
@@ -3195,7 +3195,7 @@
},
{
"exemplar": true,
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/tempopb.Querier/Search.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=~\"/tempopb.Querier/Search.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C",
@@ -3527,7 +3527,7 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(rate(tempo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", route=~\".*api_metrics.*\", job=\"$namespace/query-frontend\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", route=~\".*api_metrics.*\", job=\"$namespace/.*query-frontend\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -3632,7 +3632,7 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_metrics.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_metrics.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"range": true,
@@ -3646,7 +3646,7 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_metrics.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_metrics.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"range": true,
@@ -3660,7 +3660,7 @@
},
"editorMode": "code",
"exemplar": true,
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/query-frontend\", route=~\".*api_metrics.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*query-frontend\", route=~\".*api_metrics.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"range": true,
@@ -3763,7 +3763,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"querier_.*api_metrics.*\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=~\"querier_.*api_metrics.*\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}} {{route}}",
@@ -3866,7 +3866,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_metrics.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_metrics.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"range": true,
@@ -3878,7 +3878,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_metrics.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_metrics.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"range": true,
@@ -3890,7 +3890,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", route=~\"querier_.*api_metrics.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*querier\", route=~\"querier_.*api_metrics.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"range": true,
@@ -3992,7 +3992,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=\"/tempopb.MetricsGenerator/GetMetrics\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\"}[$__rate_interval])) by (status_code)",
"expr": "sum(rate(tempo_request_duration_seconds_count{route=\"/tempopb.MetricsGenerator/GetMetrics\", cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\"}[$__rate_interval])) by (status_code)",
"hide": false,
"interval": "",
"legendFormat": "{{status_code}}",
@@ -4095,7 +4095,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\", route=\"/tempopb.MetricsGenerator/GetMetrics\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\", route=\"/tempopb.MetricsGenerator/GetMetrics\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"range": true,
@@ -4107,7 +4107,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\", route=\"/tempopb.MetricsGenerator/GetMetrics\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\", route=\"/tempopb.MetricsGenerator/GetMetrics\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"range": true,
@@ -4119,7 +4119,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\", route=\"/tempopb.MetricsGenerator/GetMetrics\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\", route=\"/tempopb.MetricsGenerator/GetMetrics\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"range": true,
@@ -4437,7 +4437,7 @@
"pluginVersion": "9.0.0-d452322apre",
"targets": [
{
"expr": "sum(increase(tempo_ingester_blocks_flushed_total{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"}[1h]))",
"expr": "sum(increase(tempo_ingester_blocks_flushed_total{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\"}[1h]))",
"interval": "",
"legendFormat": "{{pod}}",
"refId": "A"
@@ -5132,19 +5132,19 @@
"pluginVersion": "9.0.0-d452322apre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"refId": "C"
@@ -5359,7 +5359,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\", route=~\"/tempopb.MetricsGenerator/PushSpans\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\", route=~\"/tempopb.MetricsGenerator/PushSpans\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".99",
"range": true,
@@ -5371,7 +5371,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\", route=~\"/tempopb.MetricsGenerator/PushSpans\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\", route=~\"/tempopb.MetricsGenerator/PushSpans\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".9",
"range": true,
@@ -5383,7 +5383,7 @@
"uid": "cortex-ops-01"
},
"editorMode": "code",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/metrics-generator\", route=~\"/tempopb.MetricsGenerator/PushSpans\"}[$__rate_interval])) by (le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/.*metrics-generator\", route=~\"/tempopb.MetricsGenerator/PushSpans\"}[$__rate_interval])) by (le))",
"interval": "",
"legendFormat": ".5",
"range": true,
@@ -5496,7 +5496,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempo_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (status_code, method)",
"expr": "sum(rate(tempo_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (status_code, method)",
"interval": "",
"legendFormat": "{{status_code}}-{{method}}",
"refId": "A"
@@ -5590,19 +5590,19 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (method, le))",
"expr": "histogram_quantile(.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (method, le))",
"interval": "",
"legendFormat": ".99-{{method}}",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (method, le))",
"expr": "histogram_quantile(.9, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (method, le))",
"interval": "",
"legendFormat": ".9-{{method}}",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (method, le))",
"expr": "histogram_quantile(.5, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (method, le))",
"interval": "",
"legendFormat": ".5-{{method}}",
"refId": "C"
@@ -5714,7 +5714,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempodb_backend_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (status_code, operation)",
"expr": "sum(rate(tempodb_backend_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (status_code, operation)",
"interval": "",
"legendFormat": "{{status_code}}-{{operation}}",
"refId": "A"
@@ -5808,17 +5808,17 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "histogram_quantile(.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (operation, le))",
"expr": "histogram_quantile(.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (operation, le))",
"legendFormat": ".99-{{operation}}",
"refId": "A"
},
{
"expr": "histogram_quantile(.9, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (operation, le))",
"expr": "histogram_quantile(.9, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (operation, le))",
"legendFormat": ".9-{{operation}}",
"refId": "B"
},
{
"expr": "histogram_quantile(.5, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}[$__rate_interval])) by (operation, le))",
"expr": "histogram_quantile(.5, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}[$__rate_interval])) by (operation, le))",
"legendFormat": ".5-{{operation}}",
"refId": "C"
}
@@ -5934,7 +5934,7 @@
"type": "prometheus",
"uid": "P666011C0B63BDCA4"
},
"expr": "gauge_memberlist_health_score{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}",
"expr": "gauge_memberlist_health_score{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}",
"interval": "",
"legendFormat": "{{instance}}",
"refId": "A"
@@ -6028,7 +6028,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "tempo_memberlist_client_cluster_node_health_score{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}",
"expr": "tempo_memberlist_client_cluster_node_health_score{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"}",
"interval": "",
"legendFormat": "{{instance}}",
"refId": "A"
@@ -6122,13 +6122,13 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "min(tempo_memberlist_client_cluster_members_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"})",
"expr": "min(tempo_memberlist_client_cluster_members_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"})",
"interval": "",
"legendFormat": "min",
"refId": "A"
},
{
"expr": "max(tempo_memberlist_client_cluster_members_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"})",
"expr": "max(tempo_memberlist_client_cluster_members_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"})",
"interval": "",
"legendFormat": "max",
"refId": "B"
@@ -6227,7 +6227,7 @@
"type": "prometheus",
"uid": "P666011C0B63BDCA4"
},
"expr": "min(tempo_memberlist_client_kv_store_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"})",
"expr": "min(tempo_memberlist_client_kv_store_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"})",
"interval": "",
"legendFormat": "min",
"refId": "A"
@@ -6237,7 +6237,7 @@
"type": "prometheus",
"uid": "P666011C0B63BDCA4"
},
"expr": "max(tempo_memberlist_client_kv_store_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"})",
"expr": "max(tempo_memberlist_client_kv_store_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*$component\"})",
"interval": "",
"legendFormat": "max",
"refId": "B"
@@ -6516,7 +6516,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempodb_compaction_objects_combined_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/compactor\"}[$__rate_interval])) by (level)",
"expr": "sum(rate(tempodb_compaction_objects_combined_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[$__rate_interval])) by (level)",
"interval": "",
"legendFormat": "",
"refId": "A"
@@ -6608,7 +6608,7 @@
"pluginVersion": "9.0.0-d373beebpre",
"targets": [
{
"expr": "sum(rate(tempodb_compaction_objects_written_total{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/compactor\"}[$__rate_interval])) by (level)",
"expr": "sum(rate(tempodb_compaction_objects_written_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[$__rate_interval])) by (level)",
"interval": "",
"legendFormat": "",
"refId": "A"
@@ -6701,7 +6701,7 @@
"uid": "P666011C0B63BDCA4"
},
"editorMode": "builder",
"expr": "sum(rate(tempodb_compaction_bytes_written_total{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/compactor\"}[$__rate_interval])) by (level)",
"expr": "sum(rate(tempodb_compaction_bytes_written_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[$__rate_interval])) by (level)",
"interval": "",
"legendFormat": "__auto",
"range": true,
@@ -6795,7 +6795,7 @@
"uid": "P666011C0B63BDCA4"
},
"editorMode": "code",
"expr": "sum(increase(tempodb_compaction_blocks_total{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/compactor\"}[5m])) by (level)",
"expr": "sum(increase(tempodb_compaction_blocks_total{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/.*compactor\"}[5m])) by (level)",
"interval": "",
"legendFormat": "__auto",
"range": true,

View File

@@ -282,7 +282,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -369,7 +369,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__interval])) by (le,route)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\",route=~\"api_.*\"}[$__interval])) by (le,route)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -378,7 +378,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__interval])) by (le,route)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\",route=~\"api_.*\"}[$__interval])) by (le,route)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -387,7 +387,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__interval])) by (route)",
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\",route=~\"api_.*\"}[$__interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\",route=~\"api_.*\"}[$__interval])) by (route)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -492,7 +492,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -579,7 +579,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__interval])) by (le,route)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",route=~\"querier_api_.*\"}[$__interval])) by (le,route)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -588,7 +588,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__interval])) by (le,route)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",route=~\"querier_api_.*\"}[$__interval])) by (le,route)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -597,7 +597,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__interval])) by (route)",
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",route=~\"querier_api_.*\"}[$__interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",route=~\"querier_api_.*\"}[$__interval])) by (route)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -702,7 +702,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -789,7 +789,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__interval])) by (le,endpoint)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"}[$__interval])) by (le,endpoint)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -798,7 +798,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__interval])) by (le,endpoint)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"}[$__interval])) by (le,endpoint)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -807,7 +807,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__interval])) by (endpoint)",
"expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"}[$__interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"}[$__interval])) by (endpoint)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -912,7 +912,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -999,7 +999,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (le,route)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (le,route)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1008,7 +1008,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (le,route)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (le,route)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1017,7 +1017,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (route)",
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Querier/.*\"}[$__interval])) by (route)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1122,7 +1122,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1209,7 +1209,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1218,7 +1218,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1227,7 +1227,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by ()",
"expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1332,7 +1332,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1419,7 +1419,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",operation=\"GET\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1428,7 +1428,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",operation=\"GET\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1437,7 +1437,7 @@
"step": 10
},
{
"expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__interval])) by ()",
"expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",operation=\"GET\"}[$__interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\",operation=\"GET\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,

View File

@@ -621,7 +621,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})",
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -948,7 +948,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ingester\"})",
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1275,7 +1275,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\"})",
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*metrics-generator\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1602,7 +1602,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\"})",
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1929,7 +1929,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/querier\"})",
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*querier\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -2256,7 +2256,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})",
"expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,

View File

@@ -89,7 +89,7 @@
],
"targets": [
{
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n) by (limit_name)\n",
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\"})\n) by (limit_name)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -198,7 +198,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))",
"expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\",tenant=\"$tenant\"}[$__rate_interval]))",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -207,7 +207,7 @@
"step": 10
},
{
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n",
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -216,7 +216,7 @@
"step": 10
},
{
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n",
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -303,7 +303,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))",
"expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\",tenant=\"$tenant\"}[$__rate_interval]))",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -312,7 +312,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)",
"expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -408,7 +408,7 @@
"steppedLine": false,
"targets": [
{
"expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",tenant=\"$tenant\"})",
"expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",tenant=\"$tenant\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -417,7 +417,7 @@
"step": 10
},
{
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n",
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -426,7 +426,7 @@
"step": 10
},
{
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n",
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -525,7 +525,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)",
"expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -612,7 +612,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)",
"expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/.*query-frontend\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -705,7 +705,7 @@
"steppedLine": false,
"targets": [
{
"expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})",
"expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",tenant=\"$tenant\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -786,7 +786,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n",
"expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\"})\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -879,7 +879,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"}[$__rate_interval]))",
"expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/.*metrics-generator\",tenant=\"$tenant\"}[$__rate_interval]))",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -970,7 +970,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"})",
"expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/.*metrics-generator\",tenant=\"$tenant\"})",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -979,7 +979,7 @@
"step": 10
},
{
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n",
"expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1131,7 +1131,7 @@
"options": [
],
"query": "label_values(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"}, tenant)",
"query": "label_values(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\"}, tenant)",
"refresh": 1,
"regex": "",
"sort": 2,

View File

@@ -399,7 +399,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__interval]))",
"expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"}[$__interval]))",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -408,7 +408,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__interval]))",
"expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"}[$__interval]))",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -495,7 +495,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -504,7 +504,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -513,7 +513,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_distributor_push_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__interval])) by () * 1e3 / sum(rate(tempo_distributor_push_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__interval])) by ()",
"expr": "sum(rate(tempo_distributor_push_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"}[$__interval])) by () * 1e3 / sum(rate(tempo_distributor_push_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*distributor\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -618,7 +618,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -705,7 +705,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -714,7 +714,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -723,7 +723,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by () * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by ()",
"expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by () * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -828,7 +828,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -915,7 +915,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -924,7 +924,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -933,7 +933,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__interval])) by ()",
"expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",method=\"Memcache.Put\"}[$__interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",method=\"Memcache.Put\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1038,7 +1038,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1125,7 +1125,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1134,7 +1134,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1143,7 +1143,7 @@
"step": 10
},
{
"expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by ()",
"expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*ingester\",operation=~\"(PUT|POST)\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1248,7 +1248,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1335,7 +1335,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1344,7 +1344,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",method=\"Memcache.Put\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1353,7 +1353,7 @@
"step": 10
},
{
"expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__interval])) by ()",
"expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",method=\"Memcache.Put\"}[$__interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",method=\"Memcache.Put\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1458,7 +1458,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1545,7 +1545,7 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1554,7 +1554,7 @@
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by (le,)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
@@ -1563,7 +1563,7 @@
"step": 10
},
{
"expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by ()",
"expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/.*compactor\",operation=~\"(PUT|POST)\"}[$__interval])) by ()",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,

View File

@@ -1,53 +1,53 @@
groups:
- name: loki_rules
- name: "loki_rules"
rules:
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (cluster, job)
record: cluster_job:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:loki_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:loki_request_duration_seconds:50quantile"
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (cluster, job)"
record: "cluster_job:loki_request_duration_seconds:avg"
- expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)"
record: "cluster_job:loki_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)"
record: "cluster_job:loki_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))"
record: "cluster_job_route:loki_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))"
record: "cluster_job_route:loki_request_duration_seconds:50quantile"
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: "cluster_job_route:loki_request_duration_seconds:avg"
- expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)"
record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)"
record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile"
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
namespace, job, route)"
record: "cluster_namespace_job_route:loki_request_duration_seconds:avg"
- expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate"

View File

@@ -1,322 +1,322 @@
groups:
- name: mimir_api_1
- name: "mimir_api_1"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_request_duration_seconds:50quantile
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
by (cluster, job)
record: cluster_job:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
- name: mimir_api_2
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
by (cluster, job)"
record: "cluster_job:cortex_request_duration_seconds:avg"
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)"
record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_request_duration_seconds_count:sum_rate"
- name: "mimir_api_2"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_request_duration_seconds:50quantile
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
- name: mimir_api_3
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: "cluster_job_route:cortex_request_duration_seconds:avg"
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)"
record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)"
record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)"
record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate"
- name: "mimir_api_3"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
- expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
- name: mimir_querier_api
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg"
- expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)"
record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate"
- name: "mimir_querier_api"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_querier_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_querier_request_duration_seconds:50quantile
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_querier_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_querier_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds:avg
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds:avg"
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job)"
record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))"
record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
(cluster, job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds:avg
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
(cluster, job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds:avg"
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job, route)"
record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
by (cluster, namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
- name: mimir_cache
by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg"
- expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)"
record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate"
- name: "mimir_cache"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
- expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
by (cluster, job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_cache_request_duration_seconds:avg
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
by (cluster, job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_cache_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_cache_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_cache_request_duration_seconds:avg"
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job)"
record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job)"
record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))"
record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method)
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
- name: mimir_storage
job, method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds:avg"
- expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)"
record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate"
- name: "mimir_storage"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds:avg
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
- name: mimir_queries
- expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_kv_request_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_kv_request_duration_seconds:50quantile"
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_kv_request_duration_seconds:avg"
- expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
job)"
record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate"
- name: "mimir_queries"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_retries:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_retries:50quantile
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
by (cluster, job)
record: cluster_job:cortex_query_frontend_retries:avg
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_retries:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_retries:50quantile"
- expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
by (cluster, job)"
record: "cluster_job:cortex_query_frontend_retries:avg"
- expr: "sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)"
record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate"
- expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate"
- expr: "sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_query_frontend_retries_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
(cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
- name: mimir_ingester_queries
(cluster, job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
cluster, job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate"
- expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
job)"
record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate"
- name: "mimir_ingester_queries"
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:50quantile
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_series:avg
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:50quantile
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples:avg
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_exemplars:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_exemplars:50quantile
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_exemplars:avg
- expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
- name: mimir_received_samples
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_series:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_series:50quantile"
- expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_series:avg"
- expr: "sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)"
record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_series_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_samples:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_samples:50quantile"
- expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples:avg"
- expr: "sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate"
- expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_exemplars:99quantile"
- expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
by (le, cluster, job))"
record: "cluster_job:cortex_ingester_queried_exemplars:50quantile"
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_exemplars:avg"
- expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
job)"
record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate"
- expr: "sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)"
record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate"
- name: "mimir_received_samples"
rules:
- expr: |
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
- name: mimir_exemplars_in
- expr: "|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))"
record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m"
- name: "mimir_exemplars_in"
rules:
- expr: |
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
- name: mimir_received_exemplars
- expr: "|
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))"
record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m"
- name: "mimir_received_exemplars"
rules:
- expr: |
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
- name: mimir_exemplars_ingested
- expr: "|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))"
record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m"
- name: "mimir_exemplars_ingested"
rules:
- expr: |
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
- name: mimir_exemplars_appended
- expr: "|
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))"
record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m"
- name: "mimir_exemplars_appended"
rules:
- expr: |
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
- name: mimir_scaling_rules
- expr: "|
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))"
record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m"
- name: "mimir_scaling_rules"
rules:
- expr: |
- expr: "|
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
sum by (cluster, namespace, deployment) (
label_replace(
kube_deployment_spec_replicas,
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
or
sum by (cluster, namespace, deployment) (
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
)
record: cluster_namespace_deployment:actual_replicas:count
- expr: |
label_replace(kube_statefulset_replicas, \"deployment\", \"$1\", \"statefulset\", \"(.*?)(?:-zone-[a-z])?\")
)"
record: "cluster_namespace_deployment:actual_replicas:count"
- expr: "|
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
@@ -324,21 +324,21 @@ groups:
)[24h:]
)
/ 240000
)
)"
labels:
deployment: distributor
reason: sample_rate
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "distributor"
reason: "sample_rate"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
* 0.59999999999999998 / 240000
)
)"
labels:
deployment: distributor
reason: sample_rate_limits
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "distributor"
reason: "sample_rate_limits"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
@@ -346,12 +346,12 @@ groups:
)[24h:]
)
* 3 / 80000
)
)"
labels:
deployment: ingester
reason: sample_rate
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "ingester"
reason: "sample_rate"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
ceil(
quantile_over_time(0.99,
sum by(cluster, namespace) (
@@ -359,59 +359,59 @@ groups:
)[24h:]
)
/ 1500000
)
)"
labels:
deployment: ingester
reason: active_series
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "ingester"
reason: "active_series"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"max_global_series_per_user\"})
* 3 * 0.59999999999999998 / 1500000
)
)"
labels:
deployment: ingester
reason: active_series_limits
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "ingester"
reason: "active_series_limits"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
ceil(
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"})
* 0.59999999999999998 / 80000
)
)"
labels:
deployment: ingester
reason: sample_rate_limits
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "ingester"
reason: "sample_rate_limits"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
ceil(
(sum by (cluster, namespace) (
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
cortex_ingester_tsdb_storage_blocks_bytes{job=~\".+/ingester.*\"}
) / 4)
/
avg by (cluster, namespace) (
memcached_limit_bytes{job=~".+/memcached"}
memcached_limit_bytes{job=~\".+/memcached\"}
)
)
)"
labels:
deployment: memcached
reason: active_series
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
deployment: "memcached"
reason: "active_series"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
- expr: |
)"
record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate"
- expr: "|
# Convenience rule to get the CPU request for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
@@ -424,11 +424,11 @@ groups:
label_replace(
label_replace(
kube_pod_container_resource_requests_cpu_cores,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
)
@@ -439,17 +439,17 @@ groups:
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="cpu"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
kube_pod_container_resource_requests{resource=\"cpu\"},
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
)
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
- expr: |
)"
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum"
- expr: "|
# Jobs should be sized to their CPU usage.
# We do this by comparing 99th percentile usage over the last 24hrs to
# their current provisioned #replicas and resource requests.
@@ -459,28 +459,28 @@ groups:
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
/
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
)
)"
labels:
reason: cpu_usage
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
reason: "cpu_usage"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- expr: "|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
container_memory_usage_bytes{image!=""},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
container_memory_usage_bytes{image!=\"\"},
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
record: cluster_namespace_deployment:container_memory_usage_bytes:sum
- expr: |
)"
record: "cluster_namespace_deployment:container_memory_usage_bytes:sum"
- expr: "|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
# Multi-zone deployments are grouped together removing the \"zone-X\" suffix.
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
@@ -493,11 +493,11 @@ groups:
label_replace(
label_replace(
kube_pod_container_resource_requests_memory_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
)
@@ -508,17 +508,17 @@ groups:
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="memory"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
kube_pod_container_resource_requests{resource=\"memory\"},
\"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
\"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\"
)
)
)
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
)"
record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum"
- expr: "|
# Jobs should be sized to their Memory usage.
# We do this by comparing 99th percentile usage over the last 24hrs to
# their current provisioned #replicas and resource requests.
@@ -528,44 +528,44 @@ groups:
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
/
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
)
)"
labels:
reason: memory_usage
record: cluster_namespace_deployment_reason:required_replicas:count
- name: mimir_alertmanager_rules
reason: "memory_usage"
record: "cluster_namespace_deployment_reason:required_replicas:count"
- name: "mimir_alertmanager_rules"
rules:
- expr: |
sum by (cluster, job, pod) (cortex_alertmanager_alerts)
record: cluster_job_pod:cortex_alertmanager_alerts:sum
- expr: |
sum by (cluster, job, pod) (cortex_alertmanager_silences)
record: cluster_job_pod:cortex_alertmanager_silences:sum
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
- expr: |
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
- expr: |
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
- name: mimir_ingester_rules
# Alertmanager recording rules.
# Each expression is a single-line double-quoted scalar. The earlier `"|` form
# was a block-scalar indicator left inside double quotes, where it is a literal
# character: the line break folds to a space and the expression becomes
# `| sum by ...`, which is not valid PromQL.

# Current alert and silence counts, summed per cluster/job/pod.
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_alerts)"
  record: "cluster_job_pod:cortex_alertmanager_alerts:sum"
- expr: "sum by (cluster, job, pod) (cortex_alertmanager_silences)"
  record: "cluster_job_pod:cortex_alertmanager_silences:sum"
# 5m rates of received and invalid alerts, summed per cluster/job.
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))"
  record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))"
  record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m"
# 5m rates of notifications and failed notifications, summed per cluster/job/integration.
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))"
  record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m"
- expr: "sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))"
  record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m"
# 5m rates of state replication attempts and failures, summed per cluster/job.
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))"
  record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))"
  record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m"
# 5m rates of partial state merges and failed merges, summed per cluster/job.
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))"
  record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m"
- expr: "sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))"
  record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m"
- name: "mimir_ingester_rules"
rules:
- expr: |
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
# 1m rate of ingested samples, summed per cluster/namespace/pod.
# Written as a single-line quoted scalar: a `|` inside double quotes is a
# literal character, so the earlier `"|` form prefixed the PromQL with a pipe.
- expr: "sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))"
  record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m"

View File

@@ -1,15 +1,15 @@
groups:
- name: tempo_rules
- name: "tempo_rules"
rules:
- expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
record: cluster_namespace_job_route:tempo_request_duration_seconds:avg
- expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate
- expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate
# 99th-percentile request duration per cluster/namespace/job/route, from 1m bucket rates.
- expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile"
# Median (50th-percentile) request duration over the same grouping.
- expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))"
record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile"
# Average request duration: 1m rate of the duration sum divided by the 1m rate of the count.
- expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg"
# Pre-aggregated 1m rates of the raw histogram series (bucket, sum, count).
- expr: "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate"
- expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate"
- expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)"
record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"

View File

@@ -0,0 +1,19 @@
{{- /*
ConfigMap holding the Grafana Agent dashboards (logs pipeline, operational,
remote write, tracing pipeline, overview).

It is rendered whenever any dashboard group (logs, metrics or traces) is
enabled, using the same condition as the ruler templates. The Grafana
Deployment mounts "agent-dashboards-1" outside the per-signal blocks, so
gating this ConfigMap on traces alone would leave the mount pointing at a
ConfigMap that does not exist when only logs or metrics dashboards are on.

Each dashboard is round-tripped through fromJson | toJson so it is emitted as
one compact line under the block scalar.
*/ -}}
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: agent-dashboards-1
  # Quoted so an all-digit namespace name is still rendered as a string.
  namespace: {{ $.Release.Namespace | quote }}
data:
  "agent-logs-pipeline.json": |
    {{ $.Files.Get "src/dashboards/agent-logs-pipeline.json" | fromJson | toJson }}
  "agent-operational.json": |
    {{ $.Files.Get "src/dashboards/agent-operational.json" | fromJson | toJson }}
  "agent-remote-write.json": |
    {{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }}
  "agent-tracing-pipeline.json": |
    {{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }}
  "agent.json": |
    {{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }}
{{- end }}

View File

@@ -80,4 +80,12 @@ data:
orgId: 1
type: file
{{- end }}
- disableDeletion: true
editable: false
folder: Agent
name: agent-1
options:
path: /var/lib/grafana/dashboards/agent-1
orgId: 1
type: file
{{- end }}

View File

@@ -91,6 +91,8 @@ spec:
- mountPath: /var/lib/grafana/dashboards/tempo-1
name: tempo-dashboards-1
{{- end }}
- mountPath: /var/lib/grafana/dashboards/agent-1
name: agent-dashboards-1
volumes:
- name: grafana-pv
persistentVolumeClaim:
@@ -131,6 +133,9 @@ spec:
configMap:
name: tempo-dashboards-1
{{- end }}
- name: agent-dashboards-1
configMap:
name: agent-dashboards-1
---
apiVersion: v1

View File

@@ -1,9 +1,10 @@
{{- if .Values.local.metrics.enabled }}
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: meta-mimir-ruler-for-dashboards
namespace: meta
namespace: {{ $.Release.Namespace }}
spec:
progressDeadlineSeconds: 600
replicas: 1
@@ -24,7 +25,7 @@ spec:
app.kubernetes.io/component: ruler-for-dashboards
app.kubernetes.io/instance: meta
app.kubernetes.io/name: mimir
namespace: meta
namespace: {{ $.Release.Namespace }}
spec:
containers:
- args:
@@ -91,8 +92,6 @@ spec:
runAsUser: 10001
seccompProfile:
type: RuntimeDefault
serviceAccount: meta-mimir
serviceAccountName: meta-mimir
terminationGracePeriodSeconds: 180
topologySpreadConstraints:
- labelSelector:
@@ -124,3 +123,4 @@ spec:
name: rules
name: rules
{{- end }}
{{- end }}

View File

@@ -1,3 +1,4 @@
{{- if .Values.local.metrics.enabled }}
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
---
apiVersion: v1
@@ -16,3 +17,4 @@ data:
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,9 +1,30 @@
# Specify the namespaces to monitor here
namespacesToMonitor:
- loki
- mimir
- tempo
clusterName: "meta-monitoring" # TODO check if this can be derived
# The name of the cluster where this will be installed
clusterName: "meta-monitoring"
# Set to true to write logs, metrics or traces to Grafana Cloud
cloud:
logs:
enabled: true
endpoint: to_be_changed
username: to_be_changed
password: to_be_changed
metrics:
enabled: true
endpoint: to_be_changed
username: to_be_changed
password: to_be_changed
traces:
enabled: true
endpoint: to_be_changed
username: to_be_changed
password: to_be_changed
# Set to true for a local version of logs, metrics or traces
local:
logs:
enabled: false
@@ -14,24 +35,7 @@ local:
minio:
enabled: false # Set this to true if any of the local backends above (logs, metrics or traces) is enabled
cloud:
logs:
enabled: true
endpoint:
username:
password:
metrics:
enabled: true
endpoint:
username:
password:
traces:
enabled: true
endpoint:
username:
password:
# Adding regexes here will add a stage.replace block. For more information see
# Adding regexes here will add a stage.replace block for logs. For more information see
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
logs:
piiRegexes:
@@ -63,7 +67,7 @@ kubeStateMetrics:
endpoint: kube-state-metrics.kube-state-metrics.svc.cluster.local:8080
# The following are configuration for the dependencies.
# These should not be changed.
# These should usually not be changed.
loki:
loki:
@@ -71,13 +75,22 @@ loki:
storage:
type: "s3"
s3:
endpoint: "meta-minio.meta.svc:9000"
access_key_id: rootuser
secret_access_key: rootpassword
insecure: true
s3ForcePathStyle: true
bucketNames:
chunks: loki-chunks
ruler: loki-ruler
structuredConfig:
common:
storage:
s3:
access_key_id: "{{ .Values.global.minio.rootUser }}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
compactor:
retention_enabled: true
limits_config:
retention_period: 24h
monitoring:
dashboards:
enabled: false
@@ -128,6 +141,8 @@ mimir-distributed:
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "{{ .Values.global.minio.rootPassword }}"
insecure: true
limits:
compactor_blocks_retention_period: 24h
tempo-distributed:
tempo:
@@ -141,6 +156,9 @@ tempo-distributed:
access_key: "{{ .Values.global.minio.rootUser }}"
secret_key: "{{ .Values.global.minio.rootPassword }}"
insecure: true
compactor:
compaction:
block_retention: 24h
traces:
otlp:
http:
@@ -175,4 +193,4 @@ minio:
cpu: 100m
memory: 128Mi
# Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this.
configPathmc: "/tmp/minio/mc/"
configPathmc: "/tmp/minio/mc/"

9
tools/kind.config Normal file
View File

@@ -0,0 +1,9 @@
# kind cluster configuration (kind.x-k8s.io/v1alpha4) for a cluster named "meta".
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: meta
# Node roles: a control-plane node followed by worker nodes.
nodes:
- role: control-plane
- role: worker
- role: worker
- role: worker