forked from RemoteSync/grafana-meta-monitoring-chart
Compare commits
17 Commits
add_mimir_
...
fix_tempo_
Author | SHA1 | Date | |
---|---|---|---|
|
092423c2b3 | ||
|
dcbe85a37a | ||
|
db8558982c | ||
|
49034b9f6b | ||
|
6fb22ae671 | ||
|
d3878e1516 | ||
|
8ae136e0c4 | ||
|
ac3e4462f9 | ||
|
e9aab491db | ||
|
c95c0e2ca9 | ||
|
c288a80bd4 | ||
|
93cac45b2e | ||
|
6ce4be70e2 | ||
|
176312167c | ||
|
07a336d9ed | ||
|
f4d5bcc018 | ||
|
18f0dc932a |
@@ -217,15 +217,15 @@
|
|||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [ ],
|
"seriesOverrides": [ ],
|
||||||
"spaceLength": 10,
|
"spaceLength": 10,
|
||||||
"span": 6,
|
"span": 4,
|
||||||
"stack": false,
|
"stack": false,
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
"expr": "(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} or on() vector(0)) - on () (loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} or on () vector(0))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "received",
|
"legendFormat": "in progress",
|
||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
}
|
}
|
||||||
@@ -233,7 +233,7 @@
|
|||||||
"thresholds": [ ],
|
"thresholds": [ ],
|
||||||
"timeFrom": null,
|
"timeFrom": null,
|
||||||
"timeShift": null,
|
"timeShift": null,
|
||||||
"title": "Delete Requests Received / Day",
|
"title": "# of Delete Requests (received - processed)",
|
||||||
"tooltip": {
|
"tooltip": {
|
||||||
"shared": true,
|
"shared": true,
|
||||||
"sort": 2,
|
"sort": 2,
|
||||||
@@ -293,7 +293,83 @@
|
|||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [ ],
|
"seriesOverrides": [ ],
|
||||||
"spaceLength": 10,
|
"spaceLength": 10,
|
||||||
"span": 6,
|
"span": 4,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "received",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Delete Requests Received / Day",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 5,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 4,
|
||||||
"stack": false,
|
"stack": false,
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
@@ -361,7 +437,7 @@
|
|||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"id": 5,
|
"id": 6,
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -381,7 +457,247 @@
|
|||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [ ],
|
"seriesOverrides": [ ],
|
||||||
"spaceLength": 10,
|
"spaceLength": 10,
|
||||||
"span": 12,
|
"span": 4,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"}",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{pod}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Compactor CPU usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 7,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 4,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} / 1024 / 1024 ",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{pod}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Compactor memory usage (MiB)",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 8,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 4,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "loki_boltdb_shipper_compact_tables_operation_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"}",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "{{pod}}",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [ ],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Compaction run duration (seconds)",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 2,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": [ ]
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": 0,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": null,
|
||||||
|
"repeatIteration": null,
|
||||||
|
"repeatRowId": null,
|
||||||
|
"showTitle": true,
|
||||||
|
"title": "Compactor",
|
||||||
|
"titleSize": "h6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapse": false,
|
||||||
|
"height": "250px",
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"aliasColors": { },
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"fill": 1,
|
||||||
|
"id": 9,
|
||||||
|
"legend": {
|
||||||
|
"avg": false,
|
||||||
|
"current": false,
|
||||||
|
"max": false,
|
||||||
|
"min": false,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"links": [ ],
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"percentage": false,
|
||||||
|
"pointradius": 5,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [ ],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"span": 6,
|
||||||
"stack": false,
|
"stack": false,
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
@@ -429,19 +745,7 @@
|
|||||||
"show": false
|
"show": false
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
},
|
||||||
],
|
|
||||||
"repeat": null,
|
|
||||||
"repeatIteration": null,
|
|
||||||
"repeatRowId": null,
|
|
||||||
"showTitle": true,
|
|
||||||
"title": "Failures",
|
|
||||||
"titleSize": "h6"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collapse": false,
|
|
||||||
"height": "250px",
|
|
||||||
"panels": [
|
|
||||||
{
|
{
|
||||||
"aliasColors": { },
|
"aliasColors": { },
|
||||||
"bars": false,
|
"bars": false,
|
||||||
@@ -449,7 +753,7 @@
|
|||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"id": 6,
|
"id": 10,
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -469,7 +773,7 @@
|
|||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [ ],
|
"seriesOverrides": [ ],
|
||||||
"spaceLength": 10,
|
"spaceLength": 10,
|
||||||
"span": 12,
|
"span": 6,
|
||||||
"stack": false,
|
"stack": false,
|
||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
@@ -523,7 +827,45 @@
|
|||||||
"repeatIteration": null,
|
"repeatIteration": null,
|
||||||
"repeatRowId": null,
|
"repeatRowId": null,
|
||||||
"showTitle": true,
|
"showTitle": true,
|
||||||
"title": "Deleted lines",
|
"title": "Deletion metrics",
|
||||||
|
"titleSize": "h6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapse": false,
|
||||||
|
"height": "250px",
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"datasource": "$loki_datasource",
|
||||||
|
"id": 11,
|
||||||
|
"span": 6,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} |~ \"Started processing delete request|delete request for user marked as processed\" | logfmt | line_format \"{{.ts}} user={{.user}} delete_request_id={{.delete_request_id}} msg={{.msg}}\" ",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "In progress/finished",
|
||||||
|
"type": "logs"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "$loki_datasource",
|
||||||
|
"id": 12,
|
||||||
|
"span": 6,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"} |~ \"delete request for user added\" | logfmt | line_format \"{{.ts}} user={{.user}} query='{{.query}}'\"",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Requests",
|
||||||
|
"type": "logs"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": null,
|
||||||
|
"repeatIteration": null,
|
||||||
|
"repeatRowId": null,
|
||||||
|
"showTitle": true,
|
||||||
|
"title": "List of deletion requests",
|
||||||
"titleSize": "h6"
|
"titleSize": "h6"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@@ -6,7 +6,6 @@
|
|||||||
"gnetId": null,
|
"gnetId": null,
|
||||||
"graphTooltip": 0,
|
"graphTooltip": 0,
|
||||||
"hideControls": false,
|
"hideControls": false,
|
||||||
"id": 8,
|
|
||||||
"iteration": 1583185057230,
|
"iteration": 1583185057230,
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
|
@@ -1,41 +1,27 @@
|
|||||||
{
|
{
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": [
|
"list": [ ]
|
||||||
{
|
|
||||||
"builtIn": 1,
|
|
||||||
"datasource": "-- Grafana --",
|
|
||||||
"enable": true,
|
|
||||||
"hide": true,
|
|
||||||
"iconColor": "rgba(0, 211, 255, 1)",
|
|
||||||
"name": "Annotations & Alerts",
|
|
||||||
"target": {
|
|
||||||
"limit": 100,
|
|
||||||
"matchAny": false,
|
|
||||||
"tags": [ ],
|
|
||||||
"type": "dashboard"
|
|
||||||
},
|
|
||||||
"type": "dashboard"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "${datasource}",
|
|
||||||
"enable": false,
|
|
||||||
"expr": "sum by (tenant) (changes(loki_ruler_wal_prometheus_tsdb_wal_truncations_total{tenant=~\"${tenant}\"}[$__rate_interval]))",
|
|
||||||
"iconColor": "red",
|
|
||||||
"name": "WAL Truncations",
|
|
||||||
"target": {
|
|
||||||
"queryType": "Azure Monitor",
|
|
||||||
"refId": "Anno"
|
|
||||||
},
|
|
||||||
"titleFormat": "{{tenant}}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"editable": true,
|
"editable": true,
|
||||||
"fiscalYearStartMonth": 0,
|
"fiscalYearStartMonth": 0,
|
||||||
"gnetId": null,
|
"gnetId": null,
|
||||||
"graphTooltip": 0,
|
"graphTooltip": 0,
|
||||||
|
"hideControls": false,
|
||||||
"iteration": 1635347545534,
|
"iteration": 1635347545534,
|
||||||
"links": [ ],
|
"links": [
|
||||||
|
{
|
||||||
|
"asDropdown": true,
|
||||||
|
"icon": "external link",
|
||||||
|
"includeVars": true,
|
||||||
|
"keepTime": true,
|
||||||
|
"tags": [
|
||||||
|
"loki"
|
||||||
|
],
|
||||||
|
"targetBlank": false,
|
||||||
|
"title": "Loki Dashboards",
|
||||||
|
"type": "dashboards"
|
||||||
|
}
|
||||||
|
],
|
||||||
"liveNow": false,
|
"liveNow": false,
|
||||||
"panels": [
|
"panels": [
|
||||||
{
|
{
|
||||||
@@ -599,59 +585,139 @@
|
|||||||
"type": "timeseries"
|
"type": "timeseries"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"schemaVersion": 31,
|
"refresh": "10s",
|
||||||
|
"rows": [ ],
|
||||||
|
"schemaVersion": 14,
|
||||||
"style": "dark",
|
"style": "dark",
|
||||||
"tags": [ ],
|
"tags": [
|
||||||
|
"loki"
|
||||||
|
],
|
||||||
"templating": {
|
"templating": {
|
||||||
"list": [
|
"list": [
|
||||||
{
|
{
|
||||||
"description": null,
|
"current": {
|
||||||
"error": null,
|
"text": "default",
|
||||||
|
"value": "default"
|
||||||
|
},
|
||||||
"hide": 0,
|
"hide": 0,
|
||||||
"includeAll": false,
|
"label": "Data Source",
|
||||||
"label": "Datasource",
|
|
||||||
"multi": false,
|
|
||||||
"name": "datasource",
|
"name": "datasource",
|
||||||
"options": [ ],
|
"options": [ ],
|
||||||
"query": "prometheus",
|
"query": "prometheus",
|
||||||
"queryValue": "",
|
|
||||||
"refresh": 1,
|
"refresh": 1,
|
||||||
"regex": "",
|
"regex": "",
|
||||||
"skipUrlSync": false,
|
|
||||||
"type": "datasource"
|
"type": "datasource"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"allValue": null,
|
"allValue": null,
|
||||||
"datasource": "${datasource}",
|
"current": {
|
||||||
"definition": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
|
"text": "prod",
|
||||||
"description": null,
|
"value": "prod"
|
||||||
"error": null,
|
},
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": false,
|
||||||
|
"label": "cluster",
|
||||||
|
"multi": false,
|
||||||
|
"name": "cluster",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "label_values(loki_build_info, cluster)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"sort": 2,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [ ],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": null,
|
||||||
|
"current": {
|
||||||
|
"text": "prod",
|
||||||
|
"value": "prod"
|
||||||
|
},
|
||||||
|
"datasource": "$datasource",
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": false,
|
||||||
|
"label": "namespace",
|
||||||
|
"multi": false,
|
||||||
|
"name": "namespace",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "label_values(loki_build_info{cluster=~\"$cluster\"}, namespace)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"sort": 2,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [ ],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hide": 0,
|
||||||
|
"label": null,
|
||||||
|
"name": "loki_datasource",
|
||||||
|
"options": [ ],
|
||||||
|
"query": "loki",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"type": "datasource"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": ".+",
|
||||||
|
"current": { },
|
||||||
|
"datasource": "$datasource",
|
||||||
"hide": 0,
|
"hide": 0,
|
||||||
"includeAll": true,
|
"includeAll": true,
|
||||||
"label": "Tenant",
|
"label": null,
|
||||||
"multi": true,
|
"multi": false,
|
||||||
"name": "tenant",
|
"name": "tenant",
|
||||||
"options": [ ],
|
"options": [ ],
|
||||||
"query": {
|
"query": "query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{cluster=\"$cluster\",namespace=\"$namespace\"},\"id\",\"$1\",\"tenant\",\"(.*)\")) by(id))",
|
||||||
"query": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
|
"refresh": 0,
|
||||||
"refId": "StandardVariableQuery"
|
"regex": "/\"([^\"]+)\"/",
|
||||||
},
|
"sort": 1,
|
||||||
"refresh": 2,
|
"tagValuesQuery": "",
|
||||||
"regex": "",
|
"tags": [ ],
|
||||||
"skipUrlSync": false,
|
"tagsQuery": "",
|
||||||
"sort": 0,
|
"type": "query",
|
||||||
"type": "query"
|
"useTags": false
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"time": {
|
"time": {
|
||||||
"from": "now-6h",
|
"from": "now-1h",
|
||||||
"to": "now"
|
"to": "now"
|
||||||
},
|
},
|
||||||
"timepicker": { },
|
"timepicker": {
|
||||||
"timezone": "",
|
"refresh_intervals": [
|
||||||
"title": "Recording Rules",
|
"5s",
|
||||||
"uid": "2xKA_ZK7k",
|
"10s",
|
||||||
"version": 9,
|
"30s",
|
||||||
|
"1m",
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"30m",
|
||||||
|
"1h",
|
||||||
|
"2h",
|
||||||
|
"1d"
|
||||||
|
],
|
||||||
|
"time_options": [
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"1h",
|
||||||
|
"6h",
|
||||||
|
"12h",
|
||||||
|
"24h",
|
||||||
|
"2d",
|
||||||
|
"7d",
|
||||||
|
"30d"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"timezone": "utc",
|
||||||
|
"title": "Loki / Recording Rules",
|
||||||
|
"uid": "recording-rules",
|
||||||
|
"version": 0,
|
||||||
"weekStart": ""
|
"weekStart": ""
|
||||||
}
|
}
|
@@ -6,7 +6,6 @@
|
|||||||
"gnetId": null,
|
"gnetId": null,
|
||||||
"graphTooltip": 0,
|
"graphTooltip": 0,
|
||||||
"hideControls": false,
|
"hideControls": false,
|
||||||
"id": 68,
|
|
||||||
"iteration": 1588704280892,
|
"iteration": 1588704280892,
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
@@ -567,17 +566,17 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".99",
|
"legendFormat": ".99",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.75, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.75, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".9",
|
"legendFormat": ".75",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.5, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.5, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".5",
|
"legendFormat": ".5",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
}
|
}
|
||||||
@@ -673,17 +672,17 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".99",
|
"legendFormat": ".99",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.9, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.9, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".9",
|
"legendFormat": ".9",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.5, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.5, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".5",
|
"legendFormat": ".5",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
}
|
}
|
||||||
@@ -779,7 +778,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[5m])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[5m])) by (route)",
|
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[$__rate_interval])) by (route) > 0",
|
||||||
"interval": "",
|
"interval": "",
|
||||||
"legendFormat": "{{route}}",
|
"legendFormat": "{{route}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
@@ -877,18 +876,18 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".99",
|
"legendFormat": ".99",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.9, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.9, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"hide": false,
|
"hide": false,
|
||||||
"legendFormat": ".9",
|
"legendFormat": ".9",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.5, sum by (le) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.5, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\", cluster=~\"$cluster\"})) * 1e3",
|
||||||
"hide": false,
|
"hide": false,
|
||||||
"legendFormat": ".5",
|
"legendFormat": ".5",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
@@ -985,7 +984,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[5m])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\"}[5m])) by (route)",
|
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0",
|
||||||
"interval": "",
|
"interval": "",
|
||||||
"legendFormat": "{{route}}",
|
"legendFormat": "{{route}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
@@ -1085,17 +1084,17 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))",
|
"expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))",
|
||||||
"legendFormat": "{{route}}-.99",
|
"legendFormat": "{{route}}-.99",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.9, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))",
|
"expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))",
|
||||||
"legendFormat": "{{route}}-.9",
|
"legendFormat": "{{route}}-.9",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.5, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))",
|
"expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))",
|
||||||
"legendFormat": "{{route}}-.5",
|
"legendFormat": "{{route}}-.5",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
}
|
}
|
||||||
@@ -1191,17 +1190,17 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".99-{{route}}",
|
"legendFormat": ".99-{{route}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.9, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".9-{{route}}",
|
"legendFormat": ".9-{{route}}",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.5, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".5-{{route}}",
|
"legendFormat": ".5-{{route}}",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
}
|
}
|
||||||
@@ -1297,7 +1296,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[5m])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[5m])) by (route)",
|
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (route) > 0",
|
||||||
"interval": "",
|
"interval": "",
|
||||||
"legendFormat": "{{route}}",
|
"legendFormat": "{{route}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
@@ -1396,17 +1395,17 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".99-{{route}}",
|
"legendFormat": ".99-{{route}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.9, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".9-{{route}}",
|
"legendFormat": ".9-{{route}}",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.5, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3",
|
||||||
"legendFormat": ".5-{{route}}",
|
"legendFormat": ".5-{{route}}",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
}
|
}
|
||||||
@@ -1502,7 +1501,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[5m])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[5m])) by (route)",
|
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0",
|
||||||
"interval": "",
|
"interval": "",
|
||||||
"legendFormat": "{{route}}",
|
"legendFormat": "{{route}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
@@ -2049,7 +2048,7 @@
|
|||||||
"panels": [ ],
|
"panels": [ ],
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"} | logfmt | level=\"error\"",
|
"expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"} |= \"level=error\"",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -2100,7 +2099,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[5m])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[5m])) by (route)",
|
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[$__rate_interval])) by (route) > 0",
|
||||||
"interval": "",
|
"interval": "",
|
||||||
"intervalFactor": 1,
|
"intervalFactor": 1,
|
||||||
"legendFormat": "{{route}}",
|
"legendFormat": "{{route}}",
|
||||||
@@ -2190,9 +2189,9 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (ingester)",
|
"expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)",
|
||||||
"intervalFactor": 1,
|
"intervalFactor": 1,
|
||||||
"legendFormat": "{{ingester}}",
|
"legendFormat": "{{pod}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -3256,7 +3255,7 @@
|
|||||||
"panels": [ ],
|
"panels": [ ],
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"} | logfmt | level=\"error\"",
|
"expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"} |= \"level=error\"",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -3307,7 +3306,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[1m])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[1m])) by (route)",
|
"expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (route) > 0",
|
||||||
"interval": "",
|
"interval": "",
|
||||||
"intervalFactor": 1,
|
"intervalFactor": 1,
|
||||||
"legendFormat": "{{route}}",
|
"legendFormat": "{{route}}",
|
||||||
|
@@ -53,6 +53,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -72,6 +77,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\", resource=\"cpu\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"})",
|
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
@@ -142,6 +155,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -161,6 +179,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\", resource=\"memory\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} > 0)",
|
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} > 0)",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
@@ -509,83 +535,6 @@
|
|||||||
"show": false
|
"show": false
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"aliasColors": { },
|
|
||||||
"bars": false,
|
|
||||||
"dashLength": 10,
|
|
||||||
"dashes": false,
|
|
||||||
"datasource": "$datasource",
|
|
||||||
"fill": 1,
|
|
||||||
"gridPos": { },
|
|
||||||
"id": 7,
|
|
||||||
"legend": {
|
|
||||||
"avg": false,
|
|
||||||
"current": false,
|
|
||||||
"max": false,
|
|
||||||
"min": false,
|
|
||||||
"show": true,
|
|
||||||
"total": false,
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"lines": true,
|
|
||||||
"linewidth": 1,
|
|
||||||
"links": [ ],
|
|
||||||
"nullPointMode": "null as zero",
|
|
||||||
"percentage": false,
|
|
||||||
"pointradius": 5,
|
|
||||||
"points": false,
|
|
||||||
"renderer": "flot",
|
|
||||||
"seriesOverrides": [ ],
|
|
||||||
"spaceLength": 10,
|
|
||||||
"span": 6,
|
|
||||||
"stack": false,
|
|
||||||
"steppedLine": false,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "loki_boltdb_shipper_query_readiness_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"}",
|
|
||||||
"format": "time_series",
|
|
||||||
"intervalFactor": 2,
|
|
||||||
"legendFormat": "duration",
|
|
||||||
"legendLink": null,
|
|
||||||
"step": 10
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"thresholds": [ ],
|
|
||||||
"timeFrom": null,
|
|
||||||
"timeShift": null,
|
|
||||||
"title": "Query Readiness Duration",
|
|
||||||
"tooltip": {
|
|
||||||
"shared": true,
|
|
||||||
"sort": 2,
|
|
||||||
"value_type": "individual"
|
|
||||||
},
|
|
||||||
"type": "graph",
|
|
||||||
"xaxis": {
|
|
||||||
"buckets": null,
|
|
||||||
"mode": "time",
|
|
||||||
"name": null,
|
|
||||||
"show": true,
|
|
||||||
"values": [ ]
|
|
||||||
},
|
|
||||||
"yaxes": [
|
|
||||||
{
|
|
||||||
"format": "s",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": 0,
|
|
||||||
"show": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": null,
|
|
||||||
"show": false
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"repeat": null,
|
"repeat": null,
|
||||||
@@ -607,7 +556,7 @@
|
|||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"id": 8,
|
"id": 7,
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -626,6 +575,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -645,6 +599,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\", resource=\"cpu\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"})",
|
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
@@ -695,7 +657,7 @@
|
|||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"id": 9,
|
"id": 8,
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
@@ -714,6 +676,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -733,6 +700,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\", resource=\"memory\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} > 0)",
|
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} > 0)",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
@@ -783,7 +758,7 @@
|
|||||||
"dashes": false,
|
"dashes": false,
|
||||||
"datasource": "$datasource",
|
"datasource": "$datasource",
|
||||||
"fill": 1,
|
"fill": 1,
|
||||||
"id": 10,
|
"id": 9,
|
||||||
"legend": {
|
"legend": {
|
||||||
"avg": false,
|
"avg": false,
|
||||||
"current": false,
|
"current": false,
|
||||||
|
@@ -142,7 +142,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "{{ route }} 99th Percentile",
|
"legendFormat": "{{ route }} 99th Percentile",
|
||||||
@@ -150,7 +150,7 @@
|
|||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.50, sum by (le,route) (job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "{{ route }} 50th Percentile",
|
"legendFormat": "{{ route }} 50th Percentile",
|
||||||
@@ -158,7 +158,7 @@
|
|||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "1e3 * sum(job_route:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) / sum(job_route:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}) by (route) ",
|
"expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "{{ route }} Average",
|
"legendFormat": "{{ route }} Average",
|
||||||
|
@@ -52,6 +52,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -71,6 +76,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\", resource=\"cpu\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"})",
|
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
@@ -140,6 +153,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -159,6 +177,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\", resource=\"memory\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} > 0)",
|
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-read.*\"} > 0)",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
|
@@ -128,6 +128,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -147,6 +152,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\", resource=\"cpu\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"})",
|
"expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
@@ -217,6 +230,11 @@
|
|||||||
"points": false,
|
"points": false,
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
"seriesOverrides": [
|
"seriesOverrides": [
|
||||||
|
{
|
||||||
|
"alias": "request",
|
||||||
|
"color": "#FFC000",
|
||||||
|
"fill": 0
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"alias": "limit",
|
"alias": "limit",
|
||||||
"color": "#E02F44",
|
"color": "#E02F44",
|
||||||
@@ -236,6 +254,14 @@
|
|||||||
"legendLink": null,
|
"legendLink": null,
|
||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\", resource=\"memory\"} > 0)",
|
||||||
|
"format": "time_series",
|
||||||
|
"intervalFactor": 2,
|
||||||
|
"legendFormat": "request",
|
||||||
|
"legendLink": null,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} > 0)",
|
"expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki|enterprise-logs)-write.*\"} > 0)",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
|
@@ -142,7 +142,7 @@
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.99, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "99th Percentile",
|
"legendFormat": "99th Percentile",
|
||||||
@@ -150,7 +150,7 @@
|
|||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "histogram_quantile(0.50, sum by (le) (job:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})) * 1e3",
|
"expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "50th Percentile",
|
"legendFormat": "50th Percentile",
|
||||||
@@ -158,7 +158,7 @@
|
|||||||
"step": 10
|
"step": 10
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "1e3 * sum(job:loki_request_duration_seconds_sum:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"}) / sum(job:loki_request_duration_seconds_count:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", cluster=~\"$cluster\"})",
|
"expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "Average",
|
"legendFormat": "Average",
|
||||||
|
7010
charts/meta-monitoring/src/dashboards/tempo-operational.json
Normal file
7010
charts/meta-monitoring/src/dashboards/tempo-operational.json
Normal file
File diff suppressed because it is too large
Load Diff
1612
charts/meta-monitoring/src/dashboards/tempo-reads.json
Normal file
1612
charts/meta-monitoring/src/dashboards/tempo-reads.json
Normal file
File diff suppressed because it is too large
Load Diff
2431
charts/meta-monitoring/src/dashboards/tempo-resources.json
Normal file
2431
charts/meta-monitoring/src/dashboards/tempo-resources.json
Normal file
File diff suppressed because it is too large
Load Diff
1559
charts/meta-monitoring/src/dashboards/tempo-rollout-progress.json
Normal file
1559
charts/meta-monitoring/src/dashboards/tempo-rollout-progress.json
Normal file
File diff suppressed because it is too large
Load Diff
1181
charts/meta-monitoring/src/dashboards/tempo-tenants.json
Normal file
1181
charts/meta-monitoring/src/dashboards/tempo-tenants.json
Normal file
File diff suppressed because it is too large
Load Diff
1738
charts/meta-monitoring/src/dashboards/tempo-writes.json
Normal file
1738
charts/meta-monitoring/src/dashboards/tempo-writes.json
Normal file
File diff suppressed because it is too large
Load Diff
53
charts/meta-monitoring/src/rules/loki-rules.yaml
Normal file
53
charts/meta-monitoring/src/rules/loki-rules.yaml
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
groups:
|
||||||
|
- name: loki_rules
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:loki_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:loki_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||||
|
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
||||||
|
route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||||
|
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
|
571
charts/meta-monitoring/src/rules/mimir-rules.yaml
Normal file
571
charts/meta-monitoring/src/rules/mimir-rules.yaml
Normal file
@@ -0,0 +1,571 @@
|
|||||||
|
groups:
|
||||||
|
- name: mimir_api_1
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
|
record: cluster_job:cortex_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||||
|
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_api_2
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:cortex_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:cortex_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||||
|
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:cortex_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
||||||
|
route)
|
||||||
|
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_api_3
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||||
|
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_querier_api
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_querier_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_querier_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_querier_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
|
||||||
|
(cluster, job, route)
|
||||||
|
record: cluster_job_route:cortex_querier_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job, route)
|
||||||
|
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job, route)
|
||||||
|
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job, route)
|
||||||
|
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
|
||||||
|
by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_cache
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, method))
|
||||||
|
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, method))
|
||||||
|
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
|
||||||
|
by (cluster, job, method)
|
||||||
|
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job, method)
|
||||||
|
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job, method)
|
||||||
|
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job, method)
|
||||||
|
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_cache_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, method))
|
||||||
|
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, method))
|
||||||
|
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||||
|
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job, method)
|
||||||
|
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job, method)
|
||||||
|
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||||
|
method)
|
||||||
|
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job, method)
|
||||||
|
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_storage
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_kv_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_queries
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_query_frontend_retries:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_query_frontend_retries:50quantile
|
||||||
|
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
|
record: cluster_job:cortex_query_frontend_retries:avg
|
||||||
|
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
|
||||||
|
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
||||||
|
(cluster, job)
|
||||||
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
|
||||||
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
||||||
|
cluster, job)
|
||||||
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
|
||||||
|
- name: mimir_ingester_queries
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_ingester_queried_series:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_ingester_queried_series:50quantile
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_series:avg
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_ingester_queried_samples:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_ingester_queried_samples:50quantile
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_samples:avg
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_ingester_queried_exemplars:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:cortex_ingester_queried_exemplars:50quantile
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) /
|
||||||
|
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_exemplars:avg
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster,
|
||||||
|
job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate
|
||||||
|
- expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate
|
||||||
|
- name: mimir_received_samples
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
|
||||||
|
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
||||||
|
- name: mimir_exemplars_in
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))
|
||||||
|
record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m
|
||||||
|
- name: mimir_received_exemplars
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))
|
||||||
|
record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m
|
||||||
|
- name: mimir_exemplars_ingested
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))
|
||||||
|
record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m
|
||||||
|
- name: mimir_exemplars_appended
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))
|
||||||
|
record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m
|
||||||
|
- name: mimir_scaling_rules
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the number of replicas for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
kube_deployment_spec_replicas,
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:actual_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
quantile_over_time(0.99,
|
||||||
|
sum by (cluster, namespace) (
|
||||||
|
cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
||||||
|
)[24h:]
|
||||||
|
)
|
||||||
|
/ 240000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: distributor
|
||||||
|
reason: sample_rate
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||||
|
* 0.59999999999999998 / 240000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: distributor
|
||||||
|
reason: sample_rate_limits
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
quantile_over_time(0.99,
|
||||||
|
sum by (cluster, namespace) (
|
||||||
|
cluster_namespace_job:cortex_distributor_received_samples:rate5m
|
||||||
|
)[24h:]
|
||||||
|
)
|
||||||
|
* 3 / 80000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: ingester
|
||||||
|
reason: sample_rate
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
quantile_over_time(0.99,
|
||||||
|
sum by(cluster, namespace) (
|
||||||
|
cortex_ingester_memory_series
|
||||||
|
)[24h:]
|
||||||
|
)
|
||||||
|
/ 1500000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: ingester
|
||||||
|
reason: active_series
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
|
||||||
|
* 3 * 0.59999999999999998 / 1500000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: ingester
|
||||||
|
reason: active_series_limits
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
|
||||||
|
* 0.59999999999999998 / 80000
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: ingester
|
||||||
|
reason: sample_rate_limits
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
ceil(
|
||||||
|
(sum by (cluster, namespace) (
|
||||||
|
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
|
||||||
|
) / 4)
|
||||||
|
/
|
||||||
|
avg by (cluster, namespace) (
|
||||||
|
memcached_limit_bytes{job=~".+/memcached"}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
deployment: memcached
|
||||||
|
reason: active_series
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])),
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the CPU request for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||||
|
# that remove resource metrics, ref:
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
|
||||||
|
#
|
||||||
|
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
|
||||||
|
# where kube_pod_container_resource_requests_cpu_cores was removed:
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests_cpu_cores,
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
# This expression is compatible with kube-state-metrics >= v1.4.0,
|
||||||
|
# where kube_pod_container_resource_requests was introduced.
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests{resource="cpu"},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
||||||
|
- expr: |
|
||||||
|
# Jobs should be sized to their CPU usage.
|
||||||
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
|
# their current provisioned #replicas and resource requests.
|
||||||
|
ceil(
|
||||||
|
cluster_namespace_deployment:actual_replicas:count
|
||||||
|
*
|
||||||
|
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
|
||||||
|
/
|
||||||
|
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
reason: cpu_usage
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the Memory utilization for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
container_memory_usage_bytes{image!=""},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:container_memory_usage_bytes:sum
|
||||||
|
- expr: |
|
||||||
|
# Convenience rule to get the Memory request for both a deployment and a statefulset.
|
||||||
|
# Multi-zone deployments are grouped together removing the "zone-X" suffix.
|
||||||
|
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
|
||||||
|
# that remove resource metrics, ref:
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
|
||||||
|
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
|
||||||
|
#
|
||||||
|
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
|
||||||
|
# where kube_pod_container_resource_requests_memory_bytes was removed:
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests_memory_bytes,
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or
|
||||||
|
# This expression is compatible with kube-state-metrics >= v1.4.0,
|
||||||
|
# where kube_pod_container_resource_requests was introduced.
|
||||||
|
(
|
||||||
|
sum by (cluster, namespace, deployment) (
|
||||||
|
label_replace(
|
||||||
|
label_replace(
|
||||||
|
kube_pod_container_resource_requests{resource="memory"},
|
||||||
|
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
|
||||||
|
),
|
||||||
|
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
|
||||||
|
# always matches everything and the (optional) zone is not removed.
|
||||||
|
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
|
- expr: |
|
||||||
|
# Jobs should be sized to their Memory usage.
|
||||||
|
# We do this by comparing 99th percentile usage over the last 24hrs to
|
||||||
|
# their current provisioned #replicas and resource requests.
|
||||||
|
ceil(
|
||||||
|
cluster_namespace_deployment:actual_replicas:count
|
||||||
|
*
|
||||||
|
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
|
||||||
|
/
|
||||||
|
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
|
||||||
|
)
|
||||||
|
labels:
|
||||||
|
reason: memory_usage
|
||||||
|
record: cluster_namespace_deployment_reason:required_replicas:count
|
||||||
|
- name: mimir_alertmanager_rules
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, pod) (cortex_alertmanager_alerts)
|
||||||
|
record: cluster_job_pod:cortex_alertmanager_alerts:sum
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, pod) (cortex_alertmanager_silences)
|
||||||
|
record: cluster_job_pod:cortex_alertmanager_silences:sum
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
|
||||||
|
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
|
||||||
|
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
|
||||||
|
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m
|
||||||
|
- name: mimir_ingester_rules
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
|
||||||
|
record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
|
15
charts/meta-monitoring/src/rules/tempo-rules.yaml
Normal file
15
charts/meta-monitoring/src/rules/tempo-rules.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
groups:
|
||||||
|
- name: tempo_rules
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate
|
@@ -37,6 +37,8 @@ data:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Logs
|
||||||
|
|
||||||
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
{{- if or .Values.local.logs.enabled .Values.cloud.logs.enabled }}
|
||||||
loki.source.kubernetes "pods" {
|
loki.source.kubernetes "pods" {
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
targets = discovery.relabel.rename_meta_labels.output
|
||||||
@@ -58,13 +60,131 @@ data:
|
|||||||
{{- end }}
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
// Metrics
|
||||||
|
|
||||||
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
{{- if or .Values.local.metrics.enabled .Values.cloud.metrics.enabled }}
|
||||||
prometheus.scrape "pods" {
|
prometheus.scrape "pods" {
|
||||||
targets = discovery.relabel.rename_meta_labels.output
|
targets = discovery.relabel.rename_meta_labels.output
|
||||||
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
}
|
}
|
||||||
|
{{- if .Values.kubeStateMetrics.enabled }}
|
||||||
|
|
||||||
|
prometheus.scrape "kubeStateMetrics" {
|
||||||
|
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
// cAdvisor and Kubelet metrics
|
||||||
|
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
|
||||||
|
discovery.kubernetes "all_nodes" {
|
||||||
|
role = "node"
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.relabel "all_nodes" {
|
||||||
|
targets = discovery.kubernetes.all_nodes.targets
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_node_name"]
|
||||||
|
target_label = "node"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterName -}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "cadvisor" {
|
||||||
|
targets = discovery.relabel.all_nodes.output
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
|
||||||
|
scrape_interval = "15s"
|
||||||
|
metrics_path = "/metrics/cadvisor"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||||
|
tls_config {
|
||||||
|
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "kubelet" {
|
||||||
|
targets = discovery.relabel.all_nodes.output
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
|
||||||
|
scrape_interval = "15s"
|
||||||
|
metrics_path = "/metrics"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||||
|
tls_config {
|
||||||
|
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.exporter.unix {}
|
||||||
|
|
||||||
|
prometheus.scrape "node_exporter" {
|
||||||
|
targets = prometheus.exporter.unix.targets
|
||||||
|
forward_to = [prometheus.relabel.node_exporter.receiver]
|
||||||
|
|
||||||
|
job_name = "node-exporter"
|
||||||
|
scrape_interval = "15s"
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.relabel "node_exporter" {
|
||||||
|
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
|
||||||
|
|
||||||
|
rule {
|
||||||
|
replacement = env("HOSTNAME")
|
||||||
|
target_label = "nodename"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
replacement = "node-exporter"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_node_name"]
|
||||||
|
target_label = "node"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace"]
|
||||||
|
target_label = "namespace"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_pod_name"]
|
||||||
|
target_label = "pod"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
|
||||||
|
separator = "/"
|
||||||
|
regex = "(.*)/(.*)/(.*)"
|
||||||
|
replacement = "${1}/${2}-${3}"
|
||||||
|
target_label = "job"
|
||||||
|
}
|
||||||
|
rule {
|
||||||
|
target_label = "cluster"
|
||||||
|
replacement = "{{- .Values.clusterName -}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
// Traces
|
||||||
|
|
||||||
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}
|
||||||
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
// Shamelessly copied from https://github.com/grafana/intro-to-mlt/blob/main/agent/config.river
|
||||||
otelcol.receiver.otlp "otlp_receiver" {
|
otelcol.receiver.otlp "otlp_receiver" {
|
||||||
|
@@ -1,16 +1,16 @@
|
|||||||
{{- if or (or .Values.local.logs.enabled .Values.local.metrics.enabled) .Values.local.traces.enabled }}
|
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
||||||
metadata:
|
metadata:
|
||||||
name: loki-dashboards-provisioning
|
name: dashboards-provisioning
|
||||||
namespace: {{ $.Release.Namespace }}
|
namespace: {{ $.Release.Namespace }}
|
||||||
data:
|
data:
|
||||||
dashboards.yaml: |
|
dashboards.yaml: |
|
||||||
---
|
---
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
providers:
|
providers:
|
||||||
{{- if .Values.local.logs.enabled }}
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- disableDeletion: true
|
- disableDeletion: true
|
||||||
editable: false
|
editable: false
|
||||||
folder: Loki
|
folder: Loki
|
||||||
@@ -28,7 +28,7 @@ data:
|
|||||||
orgId: 1
|
orgId: 1
|
||||||
type: file
|
type: file
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
- disableDeletion: true
|
- disableDeletion: true
|
||||||
editable: false
|
editable: false
|
||||||
folder: Mimir
|
folder: Mimir
|
||||||
@@ -70,4 +70,14 @@ data:
|
|||||||
orgId: 1
|
orgId: 1
|
||||||
type: file
|
type: file
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.traces.enabled }}
|
||||||
|
- disableDeletion: true
|
||||||
|
editable: false
|
||||||
|
folder: Tempo
|
||||||
|
name: tempo-1
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards/tempo-1
|
||||||
|
orgId: 1
|
||||||
|
type: file
|
||||||
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
@@ -65,15 +65,17 @@ spec:
|
|||||||
name: grafana-pv
|
name: grafana-pv
|
||||||
- mountPath: /etc/grafana/provisioning/datasources
|
- mountPath: /etc/grafana/provisioning/datasources
|
||||||
name: datasources-provisioning
|
name: datasources-provisioning
|
||||||
|
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||||
- mountPath: /etc/grafana/provisioning/dashboards
|
- mountPath: /etc/grafana/provisioning/dashboards
|
||||||
name: loki-dashboards-provisioning
|
name: dashboards-provisioning
|
||||||
{{- if .Values.local.logs.enabled }}
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- mountPath: /var/lib/grafana/dashboards/loki-1
|
- mountPath: /var/lib/grafana/dashboards/loki-1
|
||||||
name: loki-dashboards-1
|
name: loki-dashboards-1
|
||||||
- mountPath: /var/lib/grafana/dashboards/loki-2
|
- mountPath: /var/lib/grafana/dashboards/loki-2
|
||||||
name: loki-dashboards-2
|
name: loki-dashboards-2
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
- mountPath: /var/lib/grafana/dashboards/mimir-1
|
- mountPath: /var/lib/grafana/dashboards/mimir-1
|
||||||
name: mimir-dashboards-1
|
name: mimir-dashboards-1
|
||||||
- mountPath: /var/lib/grafana/dashboards/mimir-2
|
- mountPath: /var/lib/grafana/dashboards/mimir-2
|
||||||
@@ -85,6 +87,10 @@ spec:
|
|||||||
- mountPath: /var/lib/grafana/dashboards/mimir-5
|
- mountPath: /var/lib/grafana/dashboards/mimir-5
|
||||||
name: mimir-dashboards-5
|
name: mimir-dashboards-5
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.traces.enabled }}
|
||||||
|
- mountPath: /var/lib/grafana/dashboards/tempo-1
|
||||||
|
name: tempo-dashboards-1
|
||||||
|
{{- end }}
|
||||||
volumes:
|
volumes:
|
||||||
- name: grafana-pv
|
- name: grafana-pv
|
||||||
persistentVolumeClaim:
|
persistentVolumeClaim:
|
||||||
@@ -92,10 +98,10 @@ spec:
|
|||||||
- name: datasources-provisioning
|
- name: datasources-provisioning
|
||||||
configMap:
|
configMap:
|
||||||
name: datasources-provisioning
|
name: datasources-provisioning
|
||||||
{{- if .Values.local.logs.enabled }}
|
- name: dashboards-provisioning
|
||||||
- name: loki-dashboards-provisioning
|
|
||||||
configMap:
|
configMap:
|
||||||
name: loki-dashboards-provisioning
|
name: dashboards-provisioning
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
- name: loki-dashboards-1
|
- name: loki-dashboards-1
|
||||||
configMap:
|
configMap:
|
||||||
name: loki-dashboards-1
|
name: loki-dashboards-1
|
||||||
@@ -103,10 +109,7 @@ spec:
|
|||||||
configMap:
|
configMap:
|
||||||
name: loki-dashboards-2
|
name: loki-dashboards-2
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
- name: mimir-dashboards-provisioning
|
|
||||||
configMap:
|
|
||||||
name: mimir-dashboards-provisioning
|
|
||||||
- name: mimir-dashboards-1
|
- name: mimir-dashboards-1
|
||||||
configMap:
|
configMap:
|
||||||
name: mimir-dashboards-1
|
name: mimir-dashboards-1
|
||||||
@@ -123,6 +126,11 @@ spec:
|
|||||||
configMap:
|
configMap:
|
||||||
name: mimir-dashboards-5
|
name: mimir-dashboards-5
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.traces.enabled }}
|
||||||
|
- name: tempo-dashboards-1
|
||||||
|
configMap:
|
||||||
|
name: tempo-dashboards-1
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.logs.enabled }}
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.logs.enabled }}
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -1,4 +1,4 @@
|
|||||||
{{- if .Values.local.metrics.enabled }}
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ConfigMap
|
kind: ConfigMap
|
@@ -0,0 +1,21 @@
|
|||||||
|
{{- if .Values.dashboards.traces.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: tempo-dashboards-1
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
data:
|
||||||
|
"tempo-operational.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/tempo-operational.json" | fromJson | toJson }}
|
||||||
|
"tempo-reads.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/tempo-reads.json" | fromJson | toJson }}
|
||||||
|
"tempo-resources.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/tempo-resources.json" | fromJson | toJson }}
|
||||||
|
"tempo-rollout-progress.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/tempo-rollout-progress.json" | fromJson | toJson }}
|
||||||
|
"tempo-tenants.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/tempo-tenants.json" | fromJson | toJson }}
|
||||||
|
"tempo-writes.json": |
|
||||||
|
{{ $.Files.Get "src/dashboards/tempo-writes.json" | fromJson | toJson }}
|
||||||
|
{{- end }}
|
126
charts/meta-monitoring/templates/ruler/ruler.yaml
Normal file
126
charts/meta-monitoring/templates/ruler/ruler.yaml
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: meta-mimir-ruler-for-dashboards
|
||||||
|
namespace: meta
|
||||||
|
spec:
|
||||||
|
progressDeadlineSeconds: 600
|
||||||
|
replicas: 1
|
||||||
|
revisionHistoryLimit: 10
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
strategy:
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 50%
|
||||||
|
maxUnavailable: 0
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: ruler-for-dashboards
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
namespace: meta
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- args:
|
||||||
|
- -target=ruler
|
||||||
|
- -log.level=debug
|
||||||
|
- -ruler-storage.backend=local
|
||||||
|
- -ruler-storage.local.directory=/etc/rules
|
||||||
|
- -ruler.ring.prefix=dashboards/
|
||||||
|
- -config.expand-env=true
|
||||||
|
- -config.file=/etc/mimir/mimir.yaml
|
||||||
|
image: grafana/mimir:2.8.0
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
name: ruler
|
||||||
|
ports:
|
||||||
|
- containerPort: 8080
|
||||||
|
name: http-metrics
|
||||||
|
protocol: TCP
|
||||||
|
- containerPort: 9095
|
||||||
|
name: grpc
|
||||||
|
protocol: TCP
|
||||||
|
- containerPort: 7946
|
||||||
|
name: memberlist
|
||||||
|
protocol: TCP
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 3
|
||||||
|
httpGet:
|
||||||
|
path: /ready
|
||||||
|
port: http-metrics
|
||||||
|
scheme: HTTP
|
||||||
|
initialDelaySeconds: 45
|
||||||
|
periodSeconds: 10
|
||||||
|
successThreshold: 1
|
||||||
|
timeoutSeconds: 1
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 128Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
terminationMessagePath: /dev/termination-log
|
||||||
|
terminationMessagePolicy: File
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /etc/mimir
|
||||||
|
name: config
|
||||||
|
- mountPath: /var/mimir
|
||||||
|
name: runtime-config
|
||||||
|
- mountPath: /data
|
||||||
|
name: storage
|
||||||
|
- mountPath: /active-query-tracker
|
||||||
|
name: active-queries
|
||||||
|
- mountPath: /etc/rules/anonymous
|
||||||
|
name: rules
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
restartPolicy: Always
|
||||||
|
schedulerName: default-scheduler
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 10001
|
||||||
|
runAsGroup: 10001
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 10001
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
serviceAccount: meta-mimir
|
||||||
|
serviceAccountName: meta-mimir
|
||||||
|
terminationGracePeriodSeconds: 180
|
||||||
|
topologySpreadConstraints:
|
||||||
|
- labelSelector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: ruler
|
||||||
|
app.kubernetes.io/instance: meta
|
||||||
|
app.kubernetes.io/name: mimir
|
||||||
|
maxSkew: 1
|
||||||
|
topologyKey: kubernetes.io/hostname
|
||||||
|
whenUnsatisfiable: ScheduleAnyway
|
||||||
|
volumes:
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
items:
|
||||||
|
- key: mimir.yaml
|
||||||
|
path: mimir.yaml
|
||||||
|
name: meta-mimir-config
|
||||||
|
name: config
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: meta-mimir-runtime
|
||||||
|
name: runtime-config
|
||||||
|
- emptyDir: {}
|
||||||
|
name: storage
|
||||||
|
- emptyDir: {}
|
||||||
|
name: active-queries
|
||||||
|
- configMap:
|
||||||
|
defaultMode: 420
|
||||||
|
name: rules
|
||||||
|
name: rules
|
||||||
|
{{- end }}
|
18
charts/meta-monitoring/templates/ruler/rules-configmap.yaml
Normal file
18
charts/meta-monitoring/templates/ruler/rules-configmap.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
{{- if or (or .Values.dashboards.logs.enabled .Values.dashboards.metrics.enabled) .Values.dashboards.traces.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: rules
|
||||||
|
namespace: {{ $.Release.Namespace }}
|
||||||
|
data:
|
||||||
|
{{- if .Values.dashboards.logs.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/loki-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.metrics.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/mimir-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Values.dashboards.traces.enabled }}
|
||||||
|
{{ ($.Files.Glob "src/rules/tempo-rules.yaml").AsConfig | indent 2 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
@@ -14,7 +14,6 @@ local:
|
|||||||
minio:
|
minio:
|
||||||
enabled: false # This should be set to true if any of the previous is enabled
|
enabled: false # This should be set to true if any of the previous is enabled
|
||||||
|
|
||||||
|
|
||||||
cloud:
|
cloud:
|
||||||
logs:
|
logs:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -41,11 +40,28 @@ logs:
|
|||||||
# source: "" # Empty uses the log message
|
# source: "" # Empty uses the log message
|
||||||
# replace: "*****"
|
# replace: "*****"
|
||||||
|
|
||||||
|
# Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana
|
||||||
|
dashboards:
|
||||||
|
logs:
|
||||||
|
enabled: true
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
traces:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
global:
|
global:
|
||||||
minio:
|
minio:
|
||||||
rootUser: "rootuser"
|
rootUser: "rootuser"
|
||||||
rootPassword: "rootpassword"
|
rootPassword: "rootpassword"
|
||||||
|
|
||||||
|
kubeStateMetrics:
|
||||||
|
# Scrape https://github.com/kubernetes/kube-state-metrics by default
|
||||||
|
enabled: true
|
||||||
|
# This endpoint is created when the helm chart from
|
||||||
|
# https://artifacthub.io/packages/helm/prometheus-community/kube-state-metrics/
|
||||||
|
# is used. Change this if kube-state-metrics is installed somewhere else.
|
||||||
|
endpoint: kube-state-metrics.kube-state-metrics.svc.cluster.local:8080
|
||||||
|
|
||||||
# The following is the configuration for the dependencies.
|
# The following is the configuration for the dependencies.
|
||||||
# These should not be changed.
|
# These should not be changed.
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user