2025-09-18 05:46:24 +00:00
|
|
|
# Chart-wide defaults.
global:
  registry: docker.io
  # NOTE(review): presumably the fallback used when a component leaves
  # image.repository as null -- confirm against the templates.
  repository: freeleaps
  # Empty mapping: no node-selection constraints are set here.
  nodeSelector: {}
|
|
|
|
|
# Log-ingest toggle; switched off in these defaults.
logIngest:
  enabled: false
|
2025-09-22 08:42:10 +00:00
|
|
|
# Metrics dashboard settings; switched off in these defaults.
dashboard:
  enabled: false
  name: freeleaps-metrics-dashboard
  title: Freeleaps Metrics Dashboard
|
2025-09-18 05:46:24 +00:00
|
|
|
# Settings for the metrics workload.
# NOTE(review): the nesting below was restored from a view of this file that
# carried no indentation. Confirm it against the chart templates, especially
# probes.*.config (path/port only), serviceMonitor.namespace, configs and vpa.
metrics:
  replicas: 1

  image:
    registry: docker.io
    # NOTE(review): null here; presumably global.repository is used instead --
    # confirm against the templates.
    repository: null
    name: metrics
    tag: "1.0.0"
  imagePullPolicy: IfNotPresent

  ports:
    - name: http
      containerPort: 8009
      protocol: TCP

  resources:
    requests:
      # Quoted so the quantity stays a string rather than a float.
      cpu: '0.1'
      memory: 64Mi
    limits:
      cpu: '0.2'
      memory: 128Mi

  probes:
    liveness:
      type: httpGet
      config:
        path: /api/_/livez
        port: 8009
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
      successThreshold: 1
      failureThreshold: 5
      terminationGracePeriodSeconds: 30
    readiness:
      type: httpGet
      config:
        path: /api/_/readyz
        port: 8009
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
      successThreshold: 1
      failureThreshold: 5

  services:
    - name: metrics-service
      type: ClusterIP
      port: 8009
      targetPort: 8009

  serviceMonitor:
    enabled: true
    labels:
      release: kube-prometheus-stack
    namespace: freeleaps-monitoring-system
    interval: 30s
    scrapeTimeout: 10s

  configs:
    starrocksHost: ""
    # NOTE(review): 8009 is also this service's own HTTP port; StarRocks'
    # default FE query port is 9030 -- confirm this default is intended.
    starrocksPort: 8009
    starrocksUser: ""
    starrocksPassword: ""
    starrocksDatabase: ""
    prometheusEndpoint: ""

  vpa:
    minAllowed:
      enabled: false
      cpu: 100m
      memory: 64Mi
    maxAllowed:
      enabled: true
      # NOTE(review): equal to minAllowed.cpu and below resources.limits.cpu
      # ('0.2') -- confirm the CPU ceiling is intended.
      cpu: 100m
      memory: 128Mi
    controlledResources:
      - cpu
      - memory
|
2025-09-23 06:20:52 +00:00
|
|
|
# Alerting rules for the metrics service.
# NOTE(review): nesting was restored from a view of this file that carried no
# indentation; confirm whether this key is top-level or belongs under
# `metrics` in the chart templates.
prometheusRule:
  # Spelled "freeleaps" to match the rest of this file.
  # NOTE(review): if this value is used as the rule object's name, changing it
  # replaces the existing object on upgrade -- confirm.
  name: freeleaps-metrics
  enabled: true
  namespace: "freeleaps-monitoring-system"
  labels:
    release: kube-prometheus-stack
  rules:
    # Fires when the scrape target reports down for one minute.
    - alert: FreeleapsMetricsServiceDown
      expr: up{job="metrics-service"} == 0
      for: 1m
      labels:
        severity: critical
        service: metrics-service
      annotations:
        summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service has been down for more than 1 minute."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"

    # Fires when the per-series 5xx request rate stays above 0.1/s for five
    # minutes.
    - alert: FreeleapsMetricsServiceHighErrorRate
      expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
      for: 5m
      labels:
        severity: warning
        service: metrics-service
      annotations:
        summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"

    # The alerts below are commented out and therefore not rendered.
    # - alert: MetricsServiceHighLatency
    #   expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="metrics-service"}[5m])) > 1
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics-service
    #   annotations:
    #     summary: "High latency in metrics service (instance {{ $labels.instance }})"
    #     description: "95th percentile latency is {{ $value }} seconds."

    # - alert: MetricsServiceHighMemoryUsage
    #   expr: (process_resident_memory_bytes{job="metrics-service"} / 1024 / 1024) > 512
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics
    #   annotations:
    #     summary: "High memory usage in metrics service (instance {{ $labels.instance }})"
    #     description: "Memory usage is {{ $value }} MB."

    # - alert: MetricsServiceHighCPUUsage
    #   expr: rate(process_cpu_seconds_total{job="metrics-service"}[5m]) * 100 > 80
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics
    #   annotations:
    #     summary: "High CPU usage in metrics service (instance {{ $labels.instance }})"
    #     description: "CPU usage is {{ $value }}%."

    # - alert: MetricsServiceNoData
    #   expr: absent(up{job="metrics-service"})
    #   for: 5m
    #   labels:
    #     severity: critical
    #     service: metrics
    #   annotations:
    #     summary: "No data from metrics service (instance {{ $labels.instance }})"
    #     description: "No metrics data received from metrics service for more than 5 minutes."
|