Compare commits

..

No commits in common. "main" and "fix_dashboards" have entirely different histories.

22 changed files with 180 additions and 111 deletions

View File

@ -1,19 +0,0 @@
apiVersion: kind.x-k8s.io/v1alpha4
kind: Cluster
nodes:
- role: control-plane
kubeadmConfigPatches:
- |
kind: ClusterConfiguration
controllerManager:
extraArgs:
bind-address: 0.0.0.0
secure-port: "10257"
scheduler:
extraArgs:
bind-address: 0.0.0.0
secure-port: "10259"
- |
kind: KubeProxyConfiguration
metricsBindAddress: 0.0.0.0:10249
- role: worker

View File

@ -19,9 +19,6 @@ jobs:
updateVersions: updateVersions:
name: Update the subcharts name: Update the subcharts
runs-on: "ubuntu-latest" runs-on: "ubuntu-latest"
permissions:
contents: write
id-token: write
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
@ -69,20 +66,6 @@ jobs:
echo "changed=true" >> "${GITHUB_OUTPUT}" echo "changed=true" >> "${GITHUB_OUTPUT}"
fi fi
- id: get-secrets
uses: grafana/shared-workflows/actions/get-vault-secrets@main
with:
# Secrets placed in the ci/repo/grafana/<repo>/<path> path in Vault
repo_secrets: |
APP_ID=github-app:app-id
PRIVATE_KEY=github-app:private-key
- uses: actions/create-github-app-token@v1
id: app-token
with:
app-id: ${{ env.APP_ID }}
private-key: ${{ env.PRIVATE_KEY }}
- name: Create pull request - name: Create pull request
if: steps.update-loki.outputs.changed == 'true' || steps.update-grafana-alloy.outputs.changed == 'true' || steps.update-mimir-distributed.outputs.changed == 'true' || steps.update-tempo-distributed.outputs.changed == 'true' || steps.update-minio.outputs.changed == 'true' if: steps.update-loki.outputs.changed == 'true' || steps.update-grafana-alloy.outputs.changed == 'true' || steps.update-mimir-distributed.outputs.changed == 'true' || steps.update-tempo-distributed.outputs.changed == 'true' || steps.update-minio.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5 uses: peter-evans/create-pull-request@v5
@ -96,15 +79,10 @@ jobs:
labels: dependencies labels: dependencies
branch: chore/update-dependencies branch: chore/update-dependencies
delete-branch: true delete-branch: true
team-reviewers: "@grafana/loki-squad"
token: ${{ steps.app-token.outputs.token }}
updateGrafana: updateGrafana:
name: Update the Grafana version name: Update the Grafana version
runs-on: "ubuntu-latest" runs-on: "ubuntu-latest"
permissions:
contents: write
id-token: write
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
@ -120,20 +98,6 @@ jobs:
echo "changed=true" >> "${GITHUB_OUTPUT}" echo "changed=true" >> "${GITHUB_OUTPUT}"
fi fi
- id: get-secrets
uses: grafana/shared-workflows/actions/get-vault-secrets@main
with:
# Secrets placed in the ci/repo/grafana/<repo>/<path> path in Vault
repo_secrets: |
APP_ID=github-app:app-id
PRIVATE_KEY=github-app:private-key
- uses: actions/create-github-app-token@v1
id: app-token
with:
app-id: ${{ env.APP_ID }}
private-key: ${{ env.PRIVATE_KEY }}
- name: Create pull request - name: Create pull request
if: steps.update-grafana.outputs.changed == 'true' if: steps.update-grafana.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5 uses: peter-evans/create-pull-request@v5
@ -147,5 +111,3 @@ jobs:
labels: dependencies labels: dependencies
branch: chore/update-minio branch: chore/update-minio
delete-branch: true delete-branch: true
team-reviewers: "@grafana/loki-squad"
token: ${{ steps.app-token.outputs.token }}

View File

@ -1,7 +1,6 @@
--- ---
name: helm-ci name: helm-ci
on: on:
workflow_dispatch:
pull_request: pull_request:
paths: paths:
- "charts/meta-monitoring/**" - "charts/meta-monitoring/**"
@ -25,7 +24,7 @@ jobs:
# runs-on: ubuntu-latest # runs-on: ubuntu-latest
# steps: # steps:
# - name: Checkout # - name: Checkout
# uses: actions/checkout@v4 # uses: actions/checkout@v3
# with: # with:
# fetch-depth: 0 # fetch-depth: 0
@ -39,10 +38,10 @@ jobs:
# - name: Set up Python # - name: Set up Python
# uses: actions/setup-python@v4 # uses: actions/setup-python@v4
# with: # with:
# python-version: 3.9 # python-version: 3.7
# - name: Set up chart-testing # - name: Set up chart-testing
# uses: helm/chart-testing-action@v2 # uses: helm/chart-testing-action@v2.4.0
# - name: Run chart-testing (list-changed) # - name: Run chart-testing (list-changed)
# id: list-changed # id: list-changed
@ -56,10 +55,10 @@ jobs:
# run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false # run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false
# - name: Create kind cluster # - name: Create kind cluster
# uses: helm/kind-action@v1 # uses: helm/kind-action@v1.8.0
# if: steps.list-changed.outputs.changed == 'true' # if: steps.list-changed.outputs.changed == 'true'
# with: # with:
# config: "${{ github.workspace }}/.github/configs/cluster-config.yaml" # config: tools/kind.config
# - name: Run chart-testing (install) # - name: Run chart-testing (install)
# run: | # run: |

View File

@ -1,6 +1,8 @@
# meta-monitoring-chart # meta-monitoring-chart
This is a meta-monitoring chart for Loki, specifically Loki installed via the Loki helm chart. This is a meta-monitoring chart for Loki.
Note that this is pre-production software at the moment.
## Local and cloud modes ## Local and cloud modes
@ -9,15 +11,19 @@ to small Loki, Mimir and Tempo installations running in the meta-monitoring name
![local mode](docs/images/Meta%20monitoring%20local.png) ![local mode](docs/images/Meta%20monitoring%20local.png)
To enable local mode set `local.<logs|metrics|traces>.enabled` to true.
In the cloud mode the logs, metrics and/or traces are sent to Grafana Cloud. In the cloud mode the logs, metrics and/or traces are sent to Grafana Cloud.
![cloud mode](docs/images/Meta%20monitoring%20cloud.png) ![cloud mode](docs/images/Meta%20monitoring%20cloud.png)
To enable cloud mode set `cloud.<logs|metrics|traces>.enabled` to true. The `endpoint`, `username` and `password` settings for your Grafana Cloud logs, metrics and traces instances have to be filled in as well.
Both modes can be enabled at the same time. Cloud mode is preferred. Both modes can be enabled at the same time. Cloud mode is preferred.
## Installation ## Installation
For more instructions including how to install the chart go to the [installation](docs/installation.md) page. For more instructions including how to update the chart go to the [installation](docs/installation.md) page.
## Supported features ## Supported features
@ -27,7 +33,8 @@ For more instructions including how to install the chart go to the [installation
- Specify PII regexes that are applied to logs before they are sent to Loki (cloud or local). The capture group in the regex is replaced with *****. - Specify PII regexes that are applied to logs before they are sent to Loki (cloud or local). The capture group in the regex is replaced with *****.
- a Grafana instance is installed (when local mode is used) with the relevant datasources installed. The following dashboards are installed: - a Grafana instance is installed (when local mode is used) with the relevant datasources installed. The following dashboards are installed:
- logs dashboards - logs dashboards
- Alloy dashboards - agent dashboards
- Retention is set to 24 hours
Most of these features are enabled by default. See the values.yaml file for how to enable/disable them. Most of these features are enabled by default. See the values.yaml file for how to enable/disable them.
@ -35,7 +42,8 @@ Most of these features are enabled by default. See the values.yaml file for how
- This has not been tested on OpenShift yet. - This has not been tested on OpenShift yet.
- The underlying Loki, Mimir and Tempo are at the default size installed by the Helm chart. This might need changing when monitoring bigger Loki, Mimir or Tempo installations. - The underlying Loki, Mimir and Tempo are at the default size installed by the Helm chart. This might need changing when monitoring bigger Loki, Mimir or Tempo installations.
- MinIO is used as storage for the local mode at the moment with a limited retention. At the moment this chart cannot be used for monitoring over longer periods. - MinIO is used as storage at the moment with a limited retention. At the moment this chart cannot be used for monitoring over longer periods.
- Agent self monitoring is not done at the moment.
## Developer help topics ## Developer help topics

View File

@ -1,18 +1,18 @@
dependencies: dependencies:
- name: loki - name: loki
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 6.29.0 version: 6.5.1
- name: alloy - name: alloy
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 0.12.5 version: 0.1.1
- name: mimir-distributed - name: mimir-distributed
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 5.6.0 version: 5.3.0
- name: tempo-distributed - name: tempo-distributed
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 1.33.0 version: 1.9.9
- name: minio - name: minio
repository: https://charts.min.io repository: https://charts.min.io
version: 5.4.0 version: 5.2.0
digest: sha256:5225a03d9003384639f5d43b1971126371269347f16f221b7aed377ab85d71be digest: sha256:e0c7af6d328fe35f4b9a3557235f458d92225b84b1366dbb77c4626d3cdb5be9
generated: "2025-03-27T07:03:11.17404081Z" generated: "2024-05-09T07:02:42.911579524Z"

View File

@ -13,7 +13,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes # This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version. # to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/) # Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.3.0 version: 0.0.3
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to # incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using. # follow Semantic Versioning. They should reflect the version the application is using.
@ -22,20 +22,20 @@ appVersion: "0.0.1"
dependencies: dependencies:
- name: loki - name: loki
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 6.29.0 version: 6.5.1
condition: local.logs.enabled condition: local.logs.enabled
- name: alloy - name: alloy
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 0.12.5 version: 0.1.1
- name: mimir-distributed - name: mimir-distributed
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 5.6.0 version: 5.3.0
condition: local.metrics.enabled condition: local.metrics.enabled
- name: tempo-distributed - name: tempo-distributed
repository: https://grafana.github.io/helm-charts repository: https://grafana.github.io/helm-charts
version: 1.33.0 version: 1.9.9
condition: local.traces.enabled condition: local.traces.enabled
- name: minio - name: minio
repository: https://charts.min.io repository: https://charts.min.io
version: 5.4.0 version: 5.2.0
condition: local.minio.enabled condition: local.minio.enabled

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -2449,7 +2449,7 @@
"repeatIteration": null, "repeatIteration": null,
"repeatRowId": null, "repeatRowId": null,
"showTitle": true, "showTitle": true,
"title": "TSDB Index", "title": "TSBD Index",
"titleSize": "h6" "titleSize": "h6"
}, },
{ {

View File

@ -120,9 +120,9 @@ data:
replacement = "{{- .Values.clusterLabelValue -}}" replacement = "{{- .Values.clusterLabelValue -}}"
} }
rule { rule {
source_labels = ["__meta_kubernetes_pod_container_port_name"] source_labels = ["__meta_kubernetes_pod_container_port_number"]
action = "keep" action = "drop"
regex = ".*metrics.*" regex = "9095"
} }
} }
@ -155,7 +155,133 @@ data:
forward_to = [ {{ include "agent.prometheus_write_targets" . }} ] forward_to = [ {{ include "agent.prometheus_write_targets" . }} ]
} }
{{- if .Values.kubeStateMetrics.enabled }}
prometheus.scrape "kubeStateMetrics" {
clustering {
enabled = true
}
targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ]
forward_to = [ prometheus.relabel.filter.receiver ]
}
{{- end }}
// cAdvisor and Kubelet metrics
// Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river
discovery.kubernetes "all_nodes" {
role = "node"
namespaces {
own_namespace = true
names = [ {{ include "agent.namespaces" . }} ]
}
}
discovery.relabel "all_nodes" {
targets = discovery.kubernetes.all_nodes.targets
rule {
source_labels = ["__meta_kubernetes_node_name"]
target_label = "node"
}
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
separator = "/"
regex = "(.*)/(.*)/(.*)"
replacement = "${1}/${2}-${3}"
target_label = "job"
}
rule {
target_label = "cluster"
replacement = "{{- .Values.clusterLabelValue -}}"
}
}
prometheus.scrape "cadvisor" {
clustering {
enabled = true
}
targets = discovery.relabel.all_nodes.output
forward_to = [ prometheus.relabel.filter.receiver ]
metrics_path = "/metrics/cadvisor"
scheme = "https"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
}
}
prometheus.scrape "kubelet" {
clustering {
enabled = true
}
targets = discovery.relabel.all_nodes.output
forward_to = [ prometheus.relabel.filter.receiver ]
metrics_path = "/metrics"
scheme = "https"
bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
tls_config {
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
}
}
prometheus.exporter.unix "promexporter" {}
prometheus.scrape "node_exporter" {
clustering {
enabled = true
}
targets = prometheus.exporter.unix.promexporter.targets
forward_to = [prometheus.relabel.node_exporter.receiver]
job_name = "node-exporter"
}
prometheus.relabel "node_exporter" {
forward_to = [ prometheus.relabel.filter.receiver ]
rule {
replacement = env("HOSTNAME")
target_label = "nodename"
}
rule {
replacement = "node-exporter"
target_label = "job"
}
rule {
source_labels = ["__meta_kubernetes_node_name"]
target_label = "node"
}
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app_kubernetes_io_component"]
separator = "/"
regex = "(.*)/(.*)/(.*)"
replacement = "${1}/${2}-${3}"
target_label = "job"
}
rule {
target_label = "cluster"
replacement = "{{- .Values.clusterLabelValue -}}"
}
}
{{- end }} {{- end }}
{{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }} {{- if or .Values.local.traces.enabled .Values.cloud.traces.enabled }}

View File

@ -34,6 +34,10 @@
{{- end -}} {{- end -}}
{{- end -}} {{- end -}}
{{- if empty .Values.namespacesToMonitor -}}
{{- fail "No namespaces have been specified in namespacesToMonitor" -}}
{{- end -}}
{{- if empty .Values.metrics.retain -}} {{- if empty .Values.metrics.retain -}}
{{- fail "All metrics will be collected, please specify some in metrics.retain" -}} {{- fail "All metrics will be collected, please specify some in metrics.retain" -}}
{{- end -}} {{- end -}}

View File

@ -1,9 +1,8 @@
# Specify the namespaces to monitor here # Specify the namespaces to monitor here
# By default the chart will monitor the namespace it is installed in namespacesToMonitor:
# namespacesToMonitor: - loki
# - loki
# The name of the cluster where this will be installed # The name of the cluster where this will be installed
clusterLabelValue: "meta" clusterLabelValue: "meta-monitoring"
# Set to true to write logs, metrics or traces to Grafana Cloud # Set to true to write logs, metrics or traces to Grafana Cloud
# The secrets have to be created first # The secrets have to be created first
cloud: cloud:
@ -29,7 +28,7 @@ local:
minio: minio:
enabled: false # This should be set to true if any of the previous is enabled enabled: false # This should be set to true if any of the previous is enabled
grafana: grafana:
version: 11.4.3 version: 10.4.2
# Gateway ingress configuration # Gateway ingress configuration
ingress: ingress:
# -- Specifies whether an ingress for the gateway should be created # -- Specifies whether an ingress for the gateway should be created
@ -67,11 +66,9 @@ logs:
# The lines matching these will be kept in Loki # The lines matching these will be kept in Loki
retain: retain:
# This shows the queries # This shows the queries
- executing query
- caller=metrics.go - caller=metrics.go
# This shows any errors # This shows any errors
- level=error - level=error
- level=warn
# Log lines for delete requests # Log lines for delete requests
- delete request for user added - delete request for user added
- Started processing delete request - Started processing delete request

View File

@ -1,12 +1,8 @@
# Update the dependencies # Update the dependencies
The dependencies are the versions of Loki, Mimir, Agent and so on that are included in this chart. The dependencies are the version of Loki, Mimir, Agent and so on that are included in this chart.
The current versions can be found in the [Chart.yaml](../charts/meta-monitoring/Chart.yaml) file. The current versions can be found in the [Chart.yaml](../charts/meta-monitoring/Chart.yaml) file.
A GitHub Action runs daily to check whether updated versions are available. If so, a PR will be created.
The manual steps are as follows:
Run this in the charts/meta-monitoring directory after updating a dependency: Run this in the charts/meta-monitoring directory after updating a dependency:
``` ```

View File

@ -4,7 +4,7 @@
1. Use an existing Grafana Cloud account or set up a new one. Then create an access token: 1. Use an existing Grafana Cloud account or set up a new one. Then create an access token:
1. In a Grafana instance on Grafana Cloud go to Administration -> Users and Access -> Cloud access policies. 1. In Grafana go to Administration -> Users and Access -> Cloud access policies.
1. Click `Create access policy`. 1. Click `Create access policy`.
@ -39,7 +39,7 @@
--from-literal=endpoint='https://otlp-gateway-prod-us-east-0.grafana.net/otlp' --from-literal=endpoint='https://otlp-gateway-prod-us-east-0.grafana.net/otlp'
``` ```
The logs, metrics and traces usernames are the `User / Username / Instance IDs` of the Loki, Prometheus/Mimir and OpenTelemetry instances in Grafana Cloud. From `Home` in Grafana click on `Stacks`. Then go to the `Details` pages of Loki and Prometheus/Mimir. For OpenTelemetry go to the `Configure` page. The endpoints will also have to be changed to match your settings. The logs, metrics and traces usernames are the `User / Username / Instance IDs` of the Loki, Prometheus/Mimir and OpenTelemetry instances in Grafana Cloud. From `Home` in Grafana click on `Stacks`. Then go to the `Details` pages of Loki and Prometheus/Mimir. For OpenTelemetry go to the `Configure` page.
1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). Fill in the names of the secrets created above as needed. An example minimal values.yaml looks like this: 1. Create a values.yaml file based on the [default one](../charts/meta-monitoring/values.yaml). Fill in the names of the secrets created above as needed. An example minimal values.yaml looks like this:
@ -102,7 +102,7 @@
enabled: true enabled: true
``` ```
## Installing, updating and deleting the chart ## Installing the chart
1. Add the repo 1. Add the repo
@ -175,7 +175,7 @@ For each of the dashboard files in charts/meta-monitoring/src/dashboards folder
## Configure Loki to send traces ## Configure Loki to send traces
1. In the Loki that is being monitored enable tracing in the config: 1. In the Loki config enable tracing:
``` ```
loki: loki:
@ -195,7 +195,3 @@ For each of the dashboard files in charts/meta-monitoring/src/dashboards folder
## Configure external access using an Ingress in local mode ## Configure external access using an Ingress in local mode
When using local mode by default a Kubernetes [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) object is created to access the Grafana instance. This will need to be adapted to your cloud provider by updating the `grafana.ingress` section of the `values.yaml` file provided to Helm. Check the documentation of your cloud provider for available options. When using local mode by default a Kubernetes [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) object is created to access the Grafana instance. This will need to be adapted to your cloud provider by updating the `grafana.ingress` section of the `values.yaml` file provided to Helm. Check the documentation of your cloud provider for available options.
## Kube-state-metrics
Metrics about Kubernetes objects are scraped from [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics). This needs to be installed in the cluster. The `kubeStateMetrics.endpoint` entry in values.yaml should be set to its address (without the `/metrics` part in the URL).