Michel Hollands edc556b074 Add default secret for testing
Signed-off-by: Michel Hollands <michel.hollands@gmail.com>
2024-05-22 16:53:26 +01:00

497 lines
16 KiB
YAML

# Specify the namespaces to monitor here
namespacesToMonitor:
- loki
# The name of the cluster where this will be installed
clusterLabelValue: "meta"
# Set to true to write logs, metrics or traces to Grafana Cloud
# The secrets have to be created first
cloud:
logs:
enabled: true
secret: "logs"
metrics:
enabled: true
secret: "metrics"
traces:
enabled: true
secret: "traces"
# Set to true for a local version of logs, metrics or traces
local:
grafana:
enabled: false
logs:
enabled: false
metrics:
enabled: false
traces:
enabled: false
minio:
enabled: false # This should be set to true if any of the previous is enabled
createSecret: false # This is used for testing, do not use in production
grafana:
version: 10.4.2
# Gateway ingress configuration
ingress:
# -- Specifies whether an ingress for the gateway should be created
enabled: true
# -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18
ingressClassName: ""
# -- Annotations for the gateway ingress
annotations: {}
# -- Labels for the gateway ingress
labels: {}
# -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating
hosts:
- host: monitoring.example.com
paths:
- path: /
# -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers
pathType: Prefix
# backend:
# service:
# name: TODO
# port:
# number: TODO
# -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating
# tls:
# - secretName: grafana-tls
# hosts:
# - monitoring.example.com
logs:
# Adding regexes here will add a stage.replace block for logs. For more information see
# https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block
piiRegexes: null # This example replaces the word after password with *****
# - expression: "password (\\\\S+)"
# source: "" # Empty uses the log message
# replace: "*****""
# The lines matching these will be kept in Loki
retain:
# This shows the queries
- caller=metrics.go
# This shows any errors
- level=error
# Log lines for delete requests
- delete request for user added
- Started processing delete request
- delete request for user marked as processed
# This shows the ingest requests and is very noisy. Uncomment to include.
# - caller=push.go
# Additional log lines to retain
extraLogs: []
metrics:
# The list of metrics to retain for logging dashboards
retain:
- alloy_build_info
- alloy_config_last_load_success_timestamp_seconds
- alloy_config_last_load_successful
- alloy_config_load_failures_total
- alloy_component_controller_evaluating
- alloy_component_dependencies_wait_seconds
- alloy_component_dependencies_wait_seconds_bucket
- alloy_component_evaluation_seconds
- alloy_component_evaluation_seconds_bucket
- alloy_component_evaluation_seconds_count
- alloy_component_evaluation_seconds_sum
- alloy_component_evaluation_slow_seconds
- alloy_component_controller_running_components
- alloy_resources_machine_rx_bytes_total
- alloy_resources_machine_tx_bytes_total
- alloy_resources_process_cpu_seconds_total
- alloy_resources_process_resident_memory_bytes
- prometheus_remote_write_wal_samples_appended_total
- prometheus_remote_write_wal_storage_active_series
- cluster_node_info
- cluster_node_lamport_time
- cluster_node_update_observers
- cluster_node_gossip_health_score
- cluster_node_gossip_proto_version
- cluster_node_gossip_received_events_total
- cluster_node_peers
- cluster_transport_rx_bytes_total
- cluster_transport_rx_packets_total
- cluster_transport_rx_packets_failed_total
- cluster_transport_stream_rx_bytes_total
- cluster_transport_stream_rx_packets_failed_total
- cluster_transport_stream_rx_packets_total
- cluster_transport_stream_tx_bytes_total
- cluster_transport_stream_tx_packets_total
- cluster_transport_stream_tx_packets_failed_total
- cluster_transport_streams
- cluster_transport_tx_packets_total
- cluster_transport_tx_packets_failed_total
- cluster_transport_rx_packet_queue_length
- cluster_transport_tx_packet_queue_length
- container_cpu_usage_seconds_total
- container_fs_writes_bytes_total
- container_memory_working_set_bytes
- container_network_receive_bytes_total
- container_network_transmit_bytes_total
- container_spec_cpu_period
- container_spec_cpu_quota
- container_spec_memory_limit_bytes
- cortex_ingester_flush_queue_length
- cortex_prometheus_rule_group_iterations_total
- cortex_prometheus_rule_evaluation_failures_total
- cortex_prometheus_rule_group_rules
- cortex_prometheus_rule_group_last_duration_seconds
- cortex_prometheus_rule_group_last_evaluation_timestamp_seconds
- cortex_prometheus_rule_group_iterations_missed_total
- exporter_send_failed_spans_ratio_total
- exporter_sent_spans_ratio_total
- go_gc_duration_seconds
- go_gc_duration_seconds_count
- go_goroutines
- go_memstats_heap_inuse_bytes
- kubelet_volume_stats_used_bytes
- kubelet_volume_stats_capacity_bytes
- kube_deployment_created
- kube_persistentvolumeclaim_labels
- kube_pod_container_info
- kube_pod_container_resource_requests
- kube_pod_container_status_last_terminated_reason
- kube_pod_container_status_restarts_total
- loki_azure_blob_request_duration_seconds_bucket
- loki_boltdb_shipper_compact_tables_operation_duration_seconds
- loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds
- loki_boltdb_shipper_retention_marker_count_total
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_bucket
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_count
- loki_boltdb_shipper_retention_marker_table_processed_duration_seconds_sum
- loki_boltdb_shipper_retention_marker_table_processed_total
- loki_boltdb_shipper_request_duration_seconds_bucket
- loki_boltdb_shipper_request_duration_seconds_count
- loki_boltdb_shipper_request_duration_seconds_sum
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_bucket
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_count
- loki_boltdb_shipper_retention_sweeper_chunk_deleted_duration_seconds_sum
- loki_boltdb_shipper_retention_sweeper_marker_files_current
- loki_boltdb_shipper_retention_sweeper_marker_file_processing_current_time
- loki_build_info
- loki_chunk_store_deduped_chunks_total
- loki_chunk_store_index_entries_per_chunk_bucket
- loki_chunk_store_index_entries_per_chunk_count
- loki_chunk_store_index_entries_per_chunk_sum
- loki_compactor_delete_requests_processed_total
- loki_compactor_delete_requests_received_total
- loki_compactor_deleted_lines
- loki_compactor_oldest_pending_delete_request_age_seconds
- loki_compactor_pending_delete_requests_count
- loki_consul_request_duration_seconds_bucket
- loki_discarded_samples_total
- loki_discarded_bytes_total
- loki_distributor_bytes_received_total
- loki_distributor_lines_received_total
- loki_distributor_structured_metadata_bytes_received_total
- loki_gcs_request_duration_seconds_bucket
- loki_gcs_request_duration_seconds_count
- loki_index_request_duration_seconds_bucket
- loki_index_request_duration_seconds_count
- loki_ingester_chunk_age_seconds_bucket
- loki_ingester_chunk_age_seconds_count
- loki_ingester_chunk_age_seconds_sum
- loki_ingester_chunk_bounds_hours_bucket
- loki_ingester_chunk_bounds_hours_count
- loki_ingester_chunk_bounds_hours_sum
- loki_ingester_chunk_entries_bucket
- loki_ingester_chunk_entries_count
- loki_ingester_chunk_entries_sum
- loki_ingester_chunk_size_bytes_bucket
- loki_ingester_chunk_utilization_bucket
- loki_ingester_chunk_utilization_count
- loki_ingester_chunk_utilization_sum
- loki_ingester_chunks_flushed_total
- loki_ingester_flush_queue_length
- loki_ingester_memory_chunks
- loki_ingester_memory_streams
- loki_ingester_streams_created_total
- loki_request_duration_seconds_bucket
- loki_request_duration_seconds_count
- loki_request_duration_seconds_sum
- loki_ruler_wal_appender_ready
- loki_ruler_wal_disk_size
- loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds
- loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds
- loki_ruler_wal_prometheus_remote_storage_samples_pending
- loki_ruler_wal_prometheus_remote_storage_samples_total
- loki_ruler_wal_samples_appended_total
- loki_ruler_wal_storage_created_series_total
- loki_s3_request_duration_seconds_bucket
- loki_s3_request_duration_seconds_count
- loki_write_batch_retries_total
- loki_write_dropped_bytes_total
- loki_write_dropped_entries_total
- loki_write_sent_bytes_total
- loki_write_sent_entries_total
- node_disk_read_bytes_total
- node_disk_written_bytes_total
- process_start_time_seconds
- processor_batch_batch_send_size_ratio_bucket
- processor_batch_metadata_cardinality_ratio
- processor_batch_timeout_trigger_send_ratio_total
- prometheus_remote_storage_bytes_total
- prometheus_remote_storage_enqueue_retries_total
- prometheus_remote_storage_highest_timestamp_in_seconds
- prometheus_remote_storage_metadata_bytes_total
- prometheus_remote_storage_queue_highest_sent_timestamp_seconds
- prometheus_remote_storage_samples_dropped_total
- prometheus_remote_storage_samples_failed_total
- prometheus_remote_storage_samples_pending
- prometheus_remote_storage_samples_retried_total
- prometheus_remote_storage_samples_total
- prometheus_remote_storage_sent_batch_duration_seconds_bucket
- prometheus_remote_storage_sent_batch_duration_seconds_count
- prometheus_remote_storage_sent_batch_duration_seconds_sum
- prometheus_remote_storage_shard_capacity
- prometheus_remote_storage_shards
- prometheus_remote_storage_shards_desired
- prometheus_remote_storage_shards_max
- prometheus_remote_storage_shards_min
- prometheus_remote_storage_succeeded_samples_total
- prometheus_remote_write_wal_samples_appended_total
- prometheus_remote_write_wal_storage_active_series
- prometheus_sd_discovered_targets
- prometheus_target_interval_length_seconds_count
- prometheus_target_interval_length_seconds_sum
- prometheus_target_scrapes_exceeded_sample_limit_total
- prometheus_target_scrapes_sample_duplicate_timestamp_total
- prometheus_target_scrapes_sample_out_of_bounds_total
- prometheus_target_scrapes_sample_out_of_order_total
- prometheus_target_sync_length_seconds_sum
- prometheus_wal_watcher_current_segment
- promtail_custom_bad_words_total
- promtail_dropped_bytes_total
- promtail_files_active_total
- promtail_read_bytes_total
- promtail_read_lines_total
- promtail_request_duration_seconds_bucket
- promtail_sent_entries_total
- rpc_server_duration_milliseconds_bucket
- receiver_accepted_spans_ratio_total
- receiver_refused_spans_ratio_total
- scrape_duration_seconds
- traces_exporter_sent_spans
- traces_exporter_send_failed_spans
- traces_loadbalancer_backend_outcome
- traces_loadbalancer_num_backends
- traces_receiver_accepted_spans
- traces_receiver_refused_spans
- up
# Additional metrics to retain
extraMetrics: []
# Set enabled = true to add the default logs dashboards to the local Grafana
dashboards:
logs:
enabled: true
kubeStateMetrics:
# Scrape https://github.com/kubernetes/kube-state-metrics by default
enabled: true
# This endpoint is created when the helm chart from
# https://artifacthub.io/packages/helm/prometheus-community/kube-state-metrics/
# is used. Change this if kube-state-metrics is installed somewhere else.
endpoint: kube-state-metrics.kube-state-metrics.svc.cluster.local:8080
# The following are configuration for the dependencies.
# These should usually not be changed.
loki:
loki:
auth_enabled: false
schemaConfig:
configs:
- from: 2024-03-29
store: tsdb
object_store: s3
schema: v13
index:
prefix: index_
period: 24h
storage:
type: "s3"
s3:
insecure: true
s3ForcePathStyle: true
bucketNames:
chunks: loki-chunks
ruler: loki-ruler
structuredConfig:
common:
storage:
s3:
access_key_id: "${rootUser}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "${rootPassword}"
compactor:
retention_enabled: true
delete_request_store: s3
limits_config:
retention_period: 30d
lokiCanary:
enabled: false
test:
enabled: false
monitoring:
dashboards:
enabled: false
rules:
enabled: false
serviceMonitor:
enabled: false
selfMonitoring:
enabled: false
grafanaAgent:
installOperator: false
lokiCanary:
enabled: false
write:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
read:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
backend:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
alloy:
alloy:
clustering:
enabled: true
configMap:
create: false
name: "agent-configmap"
key: 'config.river'
resources:
requests:
cpu: '1000m'
memory: '600Mi'
limits:
memory: '4Gi'
extraPorts:
- name: "otel"
port: 4317
targetPort: 4317
protocol: "TCP"
- name: "thrifthttp"
port: 14268
targetPort: 14268
protocol: "TCP"
controller:
type: "statefulset"
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 30
targetMemoryUtilizationPercentage: 90
targetCPUUtilizationPercentage: 90
mimir-distributed:
minio:
enabled: false
global:
extraEnvFrom:
- secretRef:
name: "minio"
mimir:
structuredConfig:
alertmanager_storage:
s3:
bucket_name: mimir-ruler
blocks_storage:
backend: s3
s3:
bucket_name: mimir-tsdb
ruler_storage:
s3:
bucket_name: mimir-ruler
common:
storage:
backend: s3
s3:
bucket_name: mimir-ruler
access_key_id: "${rootUser}"
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
secret_access_key: "${rootPassword}"
insecure: true
limits:
compactor_blocks_retention_period: 30d
tempo-distributed:
tempo:
structuredConfig:
storage:
trace:
backend: s3
s3:
bucket: tempo
endpoint: "{{ .Release.Name }}-minio.{{ .Release.Namespace }}.svc:9000"
access_key: "${rootUser}"
secret_key: "${rootPassword}"
insecure: true
distributor:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
ingester:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
compactor:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
querier:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
queryFrontend:
extraArgs:
- "-config.expand-env=true"
extraEnvFrom:
- secretRef:
name: "minio"
traces:
otlp:
http:
enabled: true
grpc:
enabled: true
minio:
existingSecret: "minio"
buckets:
- name: loki-chunks
policy: none
purge: false
- name: loki-ruler
policy: none
purge: false
- name: tempo
policy: none
purge: false
- name: mimir-ruler
policy: none
purge: false
- name: mimir-tsdb
policy: none
purge: false
mode: standalone
persistence:
size: 5Gi
resources:
requests:
cpu: 100m
memory: 128Mi
# Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this.
configPathmc: "/tmp/minio/mc/"