# freeleaps-ops/freeleaps/helm-pkg/metrics/values.yaml
# (file-viewer header: 151 lines, 4.6 KiB, YAML)

# Values under the Helm `global` key: default image location and node
# selection.
global:
  # Image registry host and repository (organisation) name.
  # NOTE(review): presumably the fallback for components whose own
  # image.repository is null — confirm against the chart templates.
  registry: 'docker.io'
  repository: 'freeleaps'
  # Empty map: no node selector labels are set by default.
  nodeSelector: {}
# Log ingestion toggle; disabled by default.
# NOTE(review): the source had lost its indentation, so placing this key at
# the top level (rather than under `global`) is a reconstruction — verify
# against the chart templates.
logIngest:
  enabled: false
# Dashboard settings; the dashboard is disabled by default.
dashboard:
  enabled: false
  # Resource-style name and human-readable title of the dashboard.
  name: 'freeleaps-metrics-dashboard'
  title: 'Freeleaps Metrics Dashboard'
# Settings for the `metrics` workload: replica count, image, container port,
# resource requests/limits, health probes, Service, ServiceMonitor and
# application configuration.
# NOTE(review): the source had lost its indentation; the nesting below is a
# reconstruction — verify it against the chart templates before relying on it.
metrics:
  replicas: 1

  image:
    registry: docker.io
    # Explicitly null: no repository is set at this level.
    # NOTE(review): presumably the templates fall back to global.repository.
    repository: null
    name: metrics
    # Quoted so the tag is read as a string.
    tag: '1.0.0'
  imagePullPolicy: IfNotPresent

  # One named container port; the probes and the Service below use the same
  # port number (8009).
  ports:
    - name: http
      containerPort: 8009
      protocol: TCP

  resources:
    requests:
      # Quoted so the CPU quantity is read as a string, not a float.
      cpu: "0.1"
      memory: 64Mi
    limits:
      cpu: "0.2"
      memory: 128Mi

  probes:
    # Liveness check: HTTP GET /api/_/livez on port 8009.
    # NOTE(review): the timing fields are placed beside `config`, not inside
    # it — confirm this matches what the templates read.
    liveness:
      type: httpGet
      config:
        path: /api/_/livez
        port: 8009
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
      successThreshold: 1
      failureThreshold: 5
      terminationGracePeriodSeconds: 30
    # Readiness check: HTTP GET /api/_/readyz on port 8009, with the same
    # timing values as the liveness probe.
    readiness:
      type: httpGet
      config:
        path: /api/_/readyz
        port: 8009
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
      successThreshold: 1
      failureThreshold: 5

  # ClusterIP Service forwarding port 8009 to the container's port 8009.
  services:
    - name: metrics-service
      type: ClusterIP
      port: 8009
      targetPort: 8009

  # ServiceMonitor enabled by default: 30s scrape interval, 10s timeout.
  # NOTE(review): the `release` label presumably has to match the label
  # selector of the kube-prometheus-stack Prometheus instance — confirm.
  serviceMonitor:
    enabled: true
    labels:
      release: kube-prometheus-stack
    namespace: freeleaps-monitoring-system
    interval: 30s
    scrapeTimeout: 10s

  # Application configuration. All string values are empty placeholders here;
  # real values (especially the password) should come from an override or a
  # secret store rather than being committed.
  configs:
    starrocksHost: ""
    # NOTE(review): 8009 is also this service's own HTTP port — confirm that
    # it is really the intended StarRocks port.
    starrocksPort: 8009
    starrocksUser: ""
    starrocksPassword: ""
    starrocksDatabase: ""
    prometheusEndpoint: ""
# Vertical Pod Autoscaler resource bounds.
# NOTE(review): the source had lost its indentation, so placing this key at
# the top level (rather than under `metrics`) is a reconstruction — verify
# against the chart templates.
vpa:
  # Lower bound; disabled by default.
  minAllowed:
    enabled: false
    cpu: 100m
    memory: 64Mi
  # Upper bound; enabled by default.
  # NOTE(review): cpu here (100m) equals minAllowed.cpu, leaving no room for
  # a CPU recommendation above 100m — confirm this cap is intended.
  maxAllowed:
    enabled: true
    cpu: 100m
    memory: 128Mi
  # Resource types listed for autoscaling.
  controlledResources: [cpu, memory]
# PrometheusRule settings: alerting rules for the metrics service.
# NOTE(review): the source had lost its indentation; the nesting below is a
# reconstruction — verify it against the chart templates.
prometheusRule:
  # Spelling corrected from "freepeals-metrics". If this value is used as the
  # object name, the next Helm upgrade replaces the old object with a new one.
  name: freeleaps-metrics
  enabled: true
  namespace: "freeleaps-monitoring-system"
  # NOTE(review): presumably has to match the rule selector of the
  # kube-prometheus-stack Prometheus instance — confirm.
  labels:
    release: kube-prometheus-stack
  rules:
    # Critical: the `up` series for job "metrics-service" has been 0 for 1m.
    - alert: FreeleapsMetricsServiceDown
      # Quoted so the braces and inner double quotes are always read as one
      # plain string.
      expr: 'up{job="metrics-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: metrics-service
      annotations:
        summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service has been down for more than 1 minute."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Warning: the per-series rate of 5xx responses over 5m has stayed above
    # 0.1 requests per second for 5m.
    - alert: FreeleapsMetricsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: metrics-service
      annotations:
        summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Commented-out (inactive) alert rules:
    # - alert: MetricsServiceHighLatency
    #   expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="metrics-service"}[5m])) > 1
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics-service
    #   annotations:
    #     summary: "High latency in metrics service (instance {{ $labels.instance }})"
    #     description: "95th percentile latency is {{ $value }} seconds."
    # - alert: MetricsServiceHighMemoryUsage
    #   expr: (process_resident_memory_bytes{job="metrics-service"} / 1024 / 1024) > 512
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics
    #   annotations:
    #     summary: "High memory usage in metrics service (instance {{ $labels.instance }})"
    #     description: "Memory usage is {{ $value }} MB."
    # - alert: MetricsServiceHighCPUUsage
    #   expr: rate(process_cpu_seconds_total{job="metrics-service"}[5m]) * 100 > 80
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics
    #   annotations:
    #     summary: "High CPU usage in metrics service (instance {{ $labels.instance }})"
    #     description: "CPU usage is {{ $value }}%."
    # - alert: MetricsServiceNoData
    #   expr: absent(up{job="metrics-service"})
    #   for: 5m
    #   labels:
    #     severity: critical
    #     service: metrics
    #   annotations:
    #     summary: "No data from metrics service (instance {{ $labels.instance }})"
    #     description: "No metrics data received from metrics service for more than 5 minutes."