From 6bb31ad5e08f58ae993451c3f24460bfadee92e9 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Thu, 4 Apr 2024 11:19:34 +0100 Subject: [PATCH] Filter out metrics not in list Signed-off-by: Michel Hollands --- .../templates/agent/config.yaml | 20 +++-- .../meta-monitoring/templates/validate.yaml | 4 + charts/meta-monitoring/values.yaml | 74 +++++++++++++++++++ 3 files changed, 93 insertions(+), 5 deletions(-) diff --git a/charts/meta-monitoring/templates/agent/config.yaml b/charts/meta-monitoring/templates/agent/config.yaml index d922d41..7d59c46 100644 --- a/charts/meta-monitoring/templates/agent/config.yaml +++ b/charts/meta-monitoring/templates/agent/config.yaml @@ -127,6 +127,16 @@ data: enabled = true } targets = discovery.relabel.only_http_metrics.output + forward_to = [ prometheus.relabel.filter.receiver ] + } + + prometheus.relabel "filter" { + rule { + source_labels = ["__name__"] + regex = "({{ join "|" .Values.metrics.retain }})" + action = "keep" + } + forward_to = [ {{ include "agent.prometheus_write_targets" . }} ] } {{- if .Values.kubeStateMetrics.enabled }} @@ -136,11 +146,11 @@ data: enabled = true } targets = [ { "__address__" = "{{ .Values.kubeStateMetrics.endpoint }}" } ] - forward_to = [ {{ include "agent.prometheus_write_targets" . }} ] + forward_to = [ prometheus.relabel.filter.receiver ] } {{- end }} - // cAdvisor and Kubelete metrics + // cAdvisor and Kubelet metrics // Based on https://github.com/Chewie/loutretelecom-manifests/blob/main/manifests/addons/monitoring/config.river discovery.kubernetes "all_nodes" { role = "node" @@ -178,7 +188,7 @@ data: enabled = true } targets = discovery.relabel.all_nodes.output - forward_to = [ {{ include "agent.prometheus_write_targets" . }} ] + forward_to = [ prometheus.relabel.filter.receiver ] metrics_path = "/metrics/cadvisor" scheme = "https" @@ -194,7 +204,7 @@ data: enabled = true } targets = discovery.relabel.all_nodes.output - forward_to = [ {{ include "agent.prometheus_write_targets" . }} ] + forward_to = [ prometheus.relabel.filter.receiver ] metrics_path = "/metrics" scheme = "https" @@ -218,7 +228,7 @@ data: } prometheus.relabel "node_exporter" { - forward_to = [ {{ include "agent.prometheus_write_targets" . }} ] + forward_to = [ prometheus.relabel.filter.receiver ] rule { replacement = env("HOSTNAME") diff --git a/charts/meta-monitoring/templates/validate.yaml b/charts/meta-monitoring/templates/validate.yaml index 0377477..d775438 100644 --- a/charts/meta-monitoring/templates/validate.yaml +++ b/charts/meta-monitoring/templates/validate.yaml @@ -37,3 +37,7 @@ {{- if empty .Values.namespacesToMonitor -}} {{- fail "No namespaces have been specified in namespacesToMonitor" -}} {{- end -}} + +{{- if empty .Values.metrics.retain -}} + {{- fail "All metrics will be collected, please specify some in metrics.retain" -}} +{{- end -}} diff --git a/charts/meta-monitoring/values.yaml b/charts/meta-monitoring/values.yaml index eda8c7c..9d394a1 100644 --- a/charts/meta-monitoring/values.yaml +++ b/charts/meta-monitoring/values.yaml @@ -48,6 +48,80 @@ logs: - level=error # This shows the ingest requests and is very noisy. Uncomment to include. # - caller=push.go + # Log lines for delete requests + - delete request for user added + - Started processing delete request + - delete request for user marked as processed + +metrics: + # The list of metrics to retain for logging dashboards + retain: + - container_cpu_usage_seconds_total + - container_fs_writes_bytes_total + - container_memory_working_set_bytes + - container_network_receive_bytes_total + - container_network_transmit_bytes_total + - container_spec_cpu_period + - container_spec_cpu_quota + - container_spec_memory_limit_bytes + - cortex_ingester_flush_queue_length + - go_gc_duration_seconds + - go_goroutines + - go_memstats_heap_inuse_bytes + - kubelet_volume_stats_used_bytes + - kubelet_volume_stats_capacity_bytes + - kube_persistentvolumeclaim_labels + - kube_pod_container_resource_requests + - kube_pod_container_status_last_terminated_reason + - kube_pod_container_status_restarts_total + - loki_boltdb_shipper_compact_tables_operation_duration_seconds + - loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds + - loki_boltdb_shipper_retention_marker_count_total + - loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket + - loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_count + - loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_sum + - loki_boltdb_shipper_retention_marker_table_processed_total + - loki_boltdb_shipper_request_duration_seconds_bucket + - loki_boltdb_shipper_request_duration_seconds_count + - loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count + - loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_sum + - loki_boltdb_shipper_retention_sweeper_marker_files_current + - loki_boltdb_shipper_retention_sweeper_marker_file_processing_current_time + - loki_chunk_store_index_entries_per_chunk_count + - loki_chunk_store_index_entries_per_chunk_sum + - loki_compactor_delete_requests_processed_total + - loki_compactor_delete_requests_received_total + - loki_compactor_deleted_lines + - loki_compactor_oldest_pending_delete_request_age_seconds + - loki_compactor_pending_delete_requests_count + - loki_distributor_lines_received_total + - loki_ingester_chunk_age_seconds_bucket + - loki_ingester_chunk_age_seconds_count + - loki_ingester_chunk_age_seconds_sum + - loki_ingester_chunk_bounds_hours_bucket + - loki_ingester_chunk_bounds_hours_count + - loki_ingester_chunk_bounds_hours_sum + - loki_ingester_chunk_entries_bucket + - loki_ingester_chunk_entries_count + - loki_ingester_chunk_entries_sum + - loki_ingester_chunk_size_bytes_bucket + - loki_ingester_chunk_utilization_bucket + - loki_ingester_chunk_utilization_sum + - loki_ingester_chunks_flushed_total + - loki_ingester_memory_chunks + - loki_ingester_memory_streams + - loki_request_duration_seconds_count + - loki_ruler_wal_appender_ready + - loki_ruler_wal_disk_size + - loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds + - loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds + - loki_ruler_wal_prometheus_remote_storage_samples_pending + - loki_ruler_wal_prometheus_remote_storage_samples_total + - loki_ruler_wal_samples_appended_total + - loki_ruler_wal_storage_created_series_total + - node_disk_read_bytes_total + - node_disk_written_bytes_total + - promtail_custom_bad_words_total # Set enabled = true to add the default logs/metrics/traces dashboards to the local Grafana dashboards: