From 8f7beabe4c0276354194e346a3c5b390a43c5bc1 Mon Sep 17 00:00:00 2001 From: icecheng Date: Tue, 23 Sep 2025 14:20:52 +0800 Subject: [PATCH] feat: add prometheusrule for metrics service --- ...r and Error Alter Integration Guideline.md | 32 +++++++++ .../templates/metrics/prometheusrule.yaml | 37 ++++++++++ freeleaps/helm-pkg/metrics/values.alpha.yaml | 27 +++++++ freeleaps/helm-pkg/metrics/values.prod.yaml | 26 +++++++ freeleaps/helm-pkg/metrics/values.yaml | 72 ++++++++++++++++++- .../prod/ci/freeleaps-service-hub/Jenkinsfile | 42 +++++------ 6 files changed, 212 insertions(+), 24 deletions(-) create mode 100644 docs/Service Monitor and Error Alter Integration Guideline.md create mode 100644 freeleaps/helm-pkg/metrics/templates/metrics/prometheusrule.yaml diff --git a/docs/Service Monitor and Error Alter Integration Guideline.md b/docs/Service Monitor and Error Alter Integration Guideline.md new file mode 100644 index 00000000..0d9369b9 --- /dev/null +++ b/docs/Service Monitor and Error Alter Integration Guideline.md @@ -0,0 +1,32 @@ +# Prometheus Alert Rule Config + +Add `prometheusrule.yaml` to `/templates`. +See the example template below: +``` +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "common.names.fullname" . }} + namespace: {{ default (include "common.names.namespace" .) 
.Values.metrics.prometheusRule.namespace | quote}} + labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }} + {{- if .Values.metrics.prometheusRule.additionalLabels }} + {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }} + {{- end }} + {{- if .Values.commonAnnotations }} + annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.metrics.prometheusRule.rules }} + - name: {{ template "common.names.name" $ }} + rules: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }} + {{- end }} +{{- end }} + +``` \ No newline at end of file diff --git a/freeleaps/helm-pkg/metrics/templates/metrics/prometheusrule.yaml b/freeleaps/helm-pkg/metrics/templates/metrics/prometheusrule.yaml new file mode 100644 index 00000000..c9877e0e --- /dev/null +++ b/freeleaps/helm-pkg/metrics/templates/metrics/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.metrics.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.metrics.prometheusRule.name }} + namespace: {{ .Values.metrics.prometheusRule.namespace | quote }} + {{- with .Values.metrics.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.metrics.prometheusRule.rules }} + - name: {{ $.Values.metrics.prometheusRule.name }} + rules: + {{- range . 
}} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/metrics/values.alpha.yaml b/freeleaps/helm-pkg/metrics/values.alpha.yaml index 6972690d..709beac5 100644 --- a/freeleaps/helm-pkg/metrics/values.alpha.yaml +++ b/freeleaps/helm-pkg/metrics/values.alpha.yaml @@ -81,3 +81,30 @@ metrics: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-metrics + enabled: false + namespace: "freeleaps-monitoring-system" + rules: + - alert: FreeleapsMetricsServiceDown + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical + service: metrics-service + annotations: + summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service has been down for more than 1 minute." + runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + annotations: + summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." 
+ runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + diff --git a/freeleaps/helm-pkg/metrics/values.prod.yaml b/freeleaps/helm-pkg/metrics/values.prod.yaml index 9c036f1f..618c52a2 100644 --- a/freeleaps/helm-pkg/metrics/values.prod.yaml +++ b/freeleaps/helm-pkg/metrics/values.prod.yaml @@ -81,3 +81,29 @@ metrics: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-metrics + enabled: true + namespace: "freeleaps-monitoring-system" + rules: + - alert: FreeleapsMetricsServiceDown + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical + service: metrics-service + annotations: + summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service has been down for more than 1 minute." + runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + annotations: + summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." 
+ runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" \ No newline at end of file diff --git a/freeleaps/helm-pkg/metrics/values.yaml b/freeleaps/helm-pkg/metrics/values.yaml index 2aefffe1..5a1d5c8f 100644 --- a/freeleaps/helm-pkg/metrics/values.yaml +++ b/freeleaps/helm-pkg/metrics/values.yaml @@ -55,12 +55,12 @@ metrics: port: 8009 targetPort: 8009 serviceMonitor: - enabled: false + enabled: true labels: release: kube-prometheus-stack namespace: freeleaps-monitoring-system - internal: 30s - scrapeTimeout: '' + interval: 30s + scrapeTimeout: 10s configs: starrocksHost: "" starrocksPort: 8009 @@ -80,3 +80,69 @@ metrics: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-metrics + enabled: true + namespace: "freeleaps-monitoring-system" + rules: + - alert: FreeleapsMetricsServiceDown + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical + service: metrics-service + annotations: + summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service has been down for more than 1 minute." + runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + annotations: + summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." 
+ runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + + # - alert: MetricsServiceHighLatency + # expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="metrics-service"}[5m])) > 1 + # for: 5m + # labels: + # severity: warning + # service: metrics-service + # annotations: + # summary: "High latency in metrics service (instance {{ $labels.instance }})" + # description: "95th percentile latency is {{ $value }} seconds." + + # - alert: MetricsServiceHighMemoryUsage + # expr: (process_resident_memory_bytes{job="metrics-service"} / 1024 / 1024) > 512 + # for: 5m + # labels: + # severity: warning + # service: metrics + # annotations: + # summary: "High memory usage in metrics service (instance {{ $labels.instance }})" + # description: "Memory usage is {{ $value }} MB." + + # - alert: MetricsServiceHighCPUUsage + # expr: rate(process_cpu_seconds_total{job="metrics-service"}[5m]) * 100 > 80 + # for: 5m + # labels: + # severity: warning + # service: metrics + # annotations: + # summary: "High CPU usage in metrics service (instance {{ $labels.instance }})" + # description: "CPU usage is {{ $value }}%." + + # - alert: MetricsServiceNoData + # expr: absent(up{job="metrics-service"}) + # for: 5m + # labels: + # severity: critical + # service: metrics + # annotations: + # summary: "No data from metrics service (instance {{ $labels.instance }})" + # description: "No metrics data received from metrics service for more than 5 minutes." 
diff --git a/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile b/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile index a5f8ec5c..c1faec17 100644 --- a/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile +++ b/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile @@ -10,6 +10,27 @@ executeFreeleapsPipeline { executeMode = 'fully' commitMessageLintEnabled = false components = [ + [ + name: 'metrics', + root: 'apps/metrics', + language: 'python', + dependenciesManager: 'pip', + requirementsFile: 'requirements.txt', + buildCacheEnabled: true, + buildAgentImage: 'python:3.12-slim', + buildArtifacts: ['.'], + lintEnabled: false, + sastEnabled: false, + imageRegistry: 'docker.io', + imageRepository: 'freeleaps', + imageName: 'devops', + imageBuilder: 'dind', + dockerfilePath: 'Dockerfile', + imageBuildRoot: '.', + imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'], + registryCredentialsId: 'freeleaps-devops-docker-hub-credentials', + semanticReleaseEnabled: true + ], [ name: 'authentication', root: 'apps/authentication', @@ -135,27 +156,6 @@ executeFreeleapsPipeline { imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'], registryCredentialsId: 'freeleaps-devops-docker-hub-credentials', semanticReleaseEnabled: true - ], - [ - name: 'metrics', - root: 'apps/metrcis', - language: 'python', - dependenciesManager: 'pip', - requirementsFile: 'requirements.txt', - buildCacheEnabled: true, - buildAgentImage: 'python:3.12-slim', - buildArtifacts: ['.'], - lintEnabled: false, - sastEnabled: false, - imageRegistry: 'docker.io', - imageRepository: 'freeleaps', - imageName: 'devops', - imageBuilder: 'dind', - dockerfilePath: 'Dockerfile', - imageBuildRoot: '.', - imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'], - registryCredentialsId: 'freeleaps-devops-docker-hub-credentials', - semanticReleaseEnabled: true ] ] } \ No newline at end of file