diff --git a/freeleaps/helm-pkg/centralStorage/templates/central-storage/prometheusrule.yaml b/freeleaps/helm-pkg/centralStorage/templates/central-storage/prometheusrule.yaml new file mode 100644 index 00000000..dceb8364 --- /dev/null +++ b/freeleaps/helm-pkg/centralStorage/templates/central-storage/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.centralStorage.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.centralStorage.prometheusRule.name }} + namespace: {{ .Values.centralStorage.prometheusRule.namespace | quote }} + {{- with .Values.centralStorage.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.centralStorage.prometheusRule.rules }} + - name: {{ $.Values.centralStorage.prometheusRule.name }} + rules: + {{- range . }} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/centralStorage/values.alpha.yaml b/freeleaps/helm-pkg/centralStorage/values.alpha.yaml index c2383916..77273811 100644 --- a/freeleaps/helm-pkg/centralStorage/values.alpha.yaml +++ b/freeleaps/helm-pkg/centralStorage/values.alpha.yaml @@ -119,3 +119,30 @@ centralStorage: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-central-storage + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsCentralStorageServiceDown + expr: up{job="central-storage-service"} == 0 + for: 1m + labels: + severity: critical + service: central-storage-service + annotations: + summary: 
Freeleaps Central Storage service is down (instance {{ $labels.instance }}) + description: Freeleaps Central Storage service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsCentralStorageServiceHighErrorRate + expr: rate(http_requests_total{job="central-storage-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: central-storage-service + annotations: + summary: High error rate in freeleaps central storage service (instance {{ $labels.instance }}) + description: Freeleaps Central Storage service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml b/freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml new file mode 100644 index 00000000..7ee8417e --- /dev/null +++ b/freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.chat.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.chat.prometheusRule.name }} + namespace: {{ .Values.chat.prometheusRule.namespace | quote }} + {{- with .Values.chat.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.chat.prometheusRule.rules }} + - name: {{ $.Values.chat.prometheusRule.name }} + rules: + {{- range . 
}} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/chat/values.alpha.yaml b/freeleaps/helm-pkg/chat/values.alpha.yaml index 4741aab7..7e43d891 100644 --- a/freeleaps/helm-pkg/chat/values.alpha.yaml +++ b/freeleaps/helm-pkg/chat/values.alpha.yaml @@ -154,3 +154,30 @@ chat: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-chat + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsChatServiceDown + expr: up{job="chat-service"} == 0 + for: 1m + labels: + severity: critical + service: chat-service + annotations: + summary: Freeleaps Chat service is down (instance {{ $labels.instance }}) + description: Freeleaps Chat service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsChatServiceHighErrorRate + expr: rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: chat-service + annotations: + summary: High error rate in freeleaps chat service (instance {{ $labels.instance }}) + description: Freeleaps Chat service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/chat/values.prod.yaml b/freeleaps/helm-pkg/chat/values.prod.yaml index 8419d50f..0fe91032 100644 --- a/freeleaps/helm-pkg/chat/values.prod.yaml +++ b/freeleaps/helm-pkg/chat/values.prod.yaml @@ -145,3 +145,31 @@ chat: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-chat + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsChatServiceDown + expr: up{job="chat-service"} == 0 + for: 1m + labels: + severity: critical + service: chat-service + annotations: + summary: Freeleaps Chat service is down (instance {{ $labels.instance }}) + description: Freeleaps Chat service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsChatServiceHighErrorRate + expr: rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: chat-service + annotations: + summary: High error rate in freeleaps chat service (instance {{ $labels.instance }}) + description: Freeleaps Chat service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + diff --git a/freeleaps/helm-pkg/content/templates/content/prometheusrule.yaml b/freeleaps/helm-pkg/content/templates/content/prometheusrule.yaml new file mode 100644 index 00000000..c9ad52fa --- /dev/null +++ b/freeleaps/helm-pkg/content/templates/content/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. 
+SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.content.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.content.prometheusRule.name }} + namespace: {{ .Values.content.prometheusRule.namespace | quote }} + {{- with .Values.content.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.content.prometheusRule.rules }} + - name: {{ $.Values.content.prometheusRule.name }} + rules: + {{- range . }} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/content/values.alpha.yaml b/freeleaps/helm-pkg/content/values.alpha.yaml index eafb5ff3..5350692d 100644 --- a/freeleaps/helm-pkg/content/values.alpha.yaml +++ b/freeleaps/helm-pkg/content/values.alpha.yaml @@ -115,3 +115,30 @@ content: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-content + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsContentServiceDown + expr: up{job="content-service"} == 0 + for: 1m + labels: + severity: critical + service: content-service + annotations: + summary: Freeleaps Content service is down (instance {{ $labels.instance }}) + description: Freeleaps Content service has been down for more than 1 minute. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsContentServiceHighErrorRate + expr: rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: content-service + annotations: + summary: High error rate in freeleaps content service (instance {{ $labels.instance }}) + description: Freeleaps Content service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/content/values.prod.yaml b/freeleaps/helm-pkg/content/values.prod.yaml index 0f6967f0..67024eb7 100644 --- a/freeleaps/helm-pkg/content/values.prod.yaml +++ b/freeleaps/helm-pkg/content/values.prod.yaml @@ -106,3 +106,30 @@ content: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-content + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsContentServiceDown + expr: up{job="content-service"} == 0 + for: 1m + labels: + severity: critical + service: content-service + annotations: + summary: Freeleaps Content service is down (instance {{ $labels.instance }}) + description: Freeleaps Content service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsContentServiceHighErrorRate + expr: rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: content-service + annotations: + summary: High error rate in freeleaps content service (instance {{ $labels.instance }}) + description: Freeleaps Content service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/devops/templates/devops/prometheusrule.yaml b/freeleaps/helm-pkg/devops/templates/devops/prometheusrule.yaml new file mode 100644 index 00000000..fad378dc --- /dev/null +++ b/freeleaps/helm-pkg/devops/templates/devops/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.devops.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.devops.prometheusRule.name }} + namespace: {{ .Values.devops.prometheusRule.namespace | quote }} + {{- with .Values.devops.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.devops.prometheusRule.rules }} + - name: {{ $.Values.devops.prometheusRule.name }} + rules: + {{- range . 
}} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/devops/values.alpha.yaml b/freeleaps/helm-pkg/devops/values.alpha.yaml index 9390a35e..c35c8643 100644 --- a/freeleaps/helm-pkg/devops/values.alpha.yaml +++ b/freeleaps/helm-pkg/devops/values.alpha.yaml @@ -120,3 +120,30 @@ devops: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-devops + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsDevopsServiceDown + expr: up{job="devops-service"} == 0 + for: 1m + labels: + severity: critical + service: devops-service + annotations: + summary: Freeleaps Devops service is down (instance {{ $labels.instance }}) + description: Freeleaps Devops service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsDevopsServiceHighErrorRate + expr: rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: devops-service + annotations: + summary: High error rate in freeleaps devops service (instance {{ $labels.instance }}) + description: Freeleaps Devops service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/devops/values.prod.yaml b/freeleaps/helm-pkg/devops/values.prod.yaml index c622a5ba..12d8322b 100644 --- a/freeleaps/helm-pkg/devops/values.prod.yaml +++ b/freeleaps/helm-pkg/devops/values.prod.yaml @@ -97,3 +97,30 @@ devops: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-devops + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsDevopsServiceDown + expr: up{job="devops-service"} == 0 + for: 1m + labels: + severity: critical + service: devops-service + annotations: + summary: Freeleaps Devops service is down (instance {{ $labels.instance }}) + description: Freeleaps Devops service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsDevopsServiceHighErrorRate + expr: rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: devops-service + annotations: + summary: High error rate in freeleaps devops service (instance {{ $labels.instance }}) + description: Freeleaps Devops service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/devsvc/templates/devsvc/prometheusrule.yaml b/freeleaps/helm-pkg/devsvc/templates/devsvc/prometheusrule.yaml new file mode 100644 index 00000000..b6819802 --- /dev/null +++ b/freeleaps/helm-pkg/devsvc/templates/devsvc/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. 
+SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.devsvc.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.devsvc.prometheusRule.name }} + namespace: {{ .Values.devsvc.prometheusRule.namespace | quote }} + {{- with .Values.devsvc.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.devsvc.prometheusRule.rules }} + - name: {{ $.Values.devsvc.prometheusRule.name }} + rules: + {{- range . }} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/devsvc/values.alpha.yaml b/freeleaps/helm-pkg/devsvc/values.alpha.yaml index 4785745e..667f04bf 100644 --- a/freeleaps/helm-pkg/devsvc/values.alpha.yaml +++ b/freeleaps/helm-pkg/devsvc/values.alpha.yaml @@ -147,3 +147,30 @@ devsvc: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-devsvc + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsDevsvcServiceDown + expr: up{job="devsvc-service"} == 0 + for: 1m + labels: + severity: critical + service: devsvc-service + annotations: + summary: Freeleaps Devsvc service is down (instance {{ $labels.instance }}) + description: Freeleaps Devsvc service has been down for more than 1 minute. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsDevsvcServiceHighErrorRate + expr: rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: devsvc-service + annotations: + summary: High error rate in freeleaps devsvc service (instance {{ $labels.instance }}) + description: Freeleaps Devsvc service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/devsvc/values.prod.yaml b/freeleaps/helm-pkg/devsvc/values.prod.yaml index c1df4992..aef6da08 100644 --- a/freeleaps/helm-pkg/devsvc/values.prod.yaml +++ b/freeleaps/helm-pkg/devsvc/values.prod.yaml @@ -138,3 +138,30 @@ devsvc: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-devsvc + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsDevsvcServiceDown + expr: up{job="devsvc-service"} == 0 + for: 1m + labels: + severity: critical + service: devsvc-service + annotations: + summary: Freeleaps Devsvc service is down (instance {{ $labels.instance }}) + description: Freeleaps Devsvc service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsDevsvcServiceHighErrorRate + expr: rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: devsvc-service + annotations: + summary: High error rate in freeleaps devsvc service (instance {{ $labels.instance }}) + description: Freeleaps Devsvc service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/freeleaps/templates/freeleaps/prometheusrule.yaml b/freeleaps/helm-pkg/freeleaps/templates/freeleaps/prometheusrule.yaml new file mode 100644 index 00000000..9bf69b61 --- /dev/null +++ b/freeleaps/helm-pkg/freeleaps/templates/freeleaps/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.freeleaps.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.freeleaps.prometheusRule.name }} + namespace: {{ .Values.freeleaps.prometheusRule.namespace | quote }} + {{- with .Values.freeleaps.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.freeleaps.prometheusRule.rules }} + - name: {{ $.Values.freeleaps.prometheusRule.name }} + rules: + {{- range . 
}} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/freeleaps/values.alpha.yaml b/freeleaps/helm-pkg/freeleaps/values.alpha.yaml index f7419654..a69fb57a 100644 --- a/freeleaps/helm-pkg/freeleaps/values.alpha.yaml +++ b/freeleaps/helm-pkg/freeleaps/values.alpha.yaml @@ -141,3 +141,30 @@ freeleaps: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-freeleaps + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsFreeleapsServiceDown + expr: up{job="freeleaps-service"} == 0 + for: 1m + labels: + severity: critical + service: freeleaps-service + annotations: + summary: Freeleaps Freeleaps service is down (instance {{ $labels.instance }}) + description: Freeleaps Freeleaps service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsFreeleapsServiceHighErrorRate + expr: rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: freeleaps-service + annotations: + summary: High error rate in freeleaps freeleaps service (instance {{ $labels.instance }}) + description: Freeleaps Freeleaps service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/freeleaps/values.prod.yaml b/freeleaps/helm-pkg/freeleaps/values.prod.yaml index 00abe70c..e7b7e0d5 100644 --- a/freeleaps/helm-pkg/freeleaps/values.prod.yaml +++ b/freeleaps/helm-pkg/freeleaps/values.prod.yaml @@ -132,3 +132,30 @@ freeleaps: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-freeleaps + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsFreeleapsServiceDown + expr: up{job="freeleaps-service"} == 0 + for: 1m + labels: + severity: critical + service: freeleaps-service + annotations: + summary: Freeleaps Freeleaps service is down (instance {{ $labels.instance }}) + description: Freeleaps Freeleaps service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsFreeleapsServiceHighErrorRate + expr: rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: freeleaps-service + annotations: + summary: High error rate in freeleaps freeleaps service (instance {{ $labels.instance }}) + description: Freeleaps Freeleaps service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/notification/templates/notification/prometheusrule.yaml b/freeleaps/helm-pkg/notification/templates/notification/prometheusrule.yaml new file mode 100644 index 00000000..cd587834 --- /dev/null +++ b/freeleaps/helm-pkg/notification/templates/notification/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. 
All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.notification.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.notification.prometheusRule.name }} + namespace: {{ .Values.notification.prometheusRule.namespace | quote }} + {{- with .Values.notification.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.notification.prometheusRule.rules }} + - name: {{ $.Values.notification.prometheusRule.name }} + rules: + {{- range . }} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/notification/values.alpha.yaml b/freeleaps/helm-pkg/notification/values.alpha.yaml index c888b36c..2d55d4e7 100644 --- a/freeleaps/helm-pkg/notification/values.alpha.yaml +++ b/freeleaps/helm-pkg/notification/values.alpha.yaml @@ -143,3 +143,30 @@ notification: remoteRef: key: freeleaps-alpha-twilio-auth-token type: Secret + prometheusRule: + name: freeleaps-alpha-notification + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsNotificationServiceDown + expr: up{job="notification-service"} == 0 + for: 1m + labels: + severity: critical + service: notification-service + annotations: + summary: Freeleaps Notification service is down (instance {{ $labels.instance }}) + description: Freeleaps Notification service has been down for more than 1 minute. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsNotificationServiceHighErrorRate + expr: rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: notification-service + annotations: + summary: High error rate in freeleaps notification service (instance {{ $labels.instance }}) + description: Freeleaps Notification service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/notification/values.prod.yaml b/freeleaps/helm-pkg/notification/values.prod.yaml index ff1f778a..44aceba7 100644 --- a/freeleaps/helm-pkg/notification/values.prod.yaml +++ b/freeleaps/helm-pkg/notification/values.prod.yaml @@ -129,3 +129,30 @@ notification: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-notification + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsNotificationServiceDown + expr: up{job="notification-service"} == 0 + for: 1m + labels: + severity: critical + service: notification-service + annotations: + summary: Freeleaps Notification service is down (instance {{ $labels.instance }}) + description: Freeleaps Notification service has been down for more than 1 minute. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsNotificationServiceHighErrorRate + expr: rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: notification-service + annotations: + summary: High error rate in freeleaps notification service (instance {{ $labels.instance }}) + description: Freeleaps Notification service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/payment/templates/payment/prometheusrule.yaml b/freeleaps/helm-pkg/payment/templates/payment/prometheusrule.yaml new file mode 100644 index 00000000..5e17dbb2 --- /dev/null +++ b/freeleaps/helm-pkg/payment/templates/payment/prometheusrule.yaml @@ -0,0 +1,37 @@ +{{- /* +Copyright Broadcom, Inc. All Rights Reserved. +SPDX-License-Identifier: APACHE-2.0 +*/}} + +{{- if .Values.payment.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Values.payment.prometheusRule.name }} + namespace: {{ .Values.payment.prometheusRule.namespace | quote }} + {{- with .Values.payment.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + {{- with .Values.payment.prometheusRule.rules }} + - name: {{ $.Values.payment.prometheusRule.name }} + rules: + {{- range . 
}} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/freeleaps/helm-pkg/payment/values.alpha.yaml b/freeleaps/helm-pkg/payment/values.alpha.yaml index 54300c40..3e46b785 100644 --- a/freeleaps/helm-pkg/payment/values.alpha.yaml +++ b/freeleaps/helm-pkg/payment/values.alpha.yaml @@ -115,3 +115,30 @@ payment: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-alpha-payment + enabled: false + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsPaymentServiceDown + expr: up{job="payment-service"} == 0 + for: 1m + labels: + severity: critical + service: payment-service + annotations: + summary: Freeleaps Payment service is down (instance {{ $labels.instance }}) + description: Freeleaps Payment service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsPaymentServiceHighErrorRate + expr: rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: payment-service + annotations: + summary: High error rate in freeleaps payment service (instance {{ $labels.instance }}) + description: Freeleaps Payment service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/payment/values.prod.yaml b/freeleaps/helm-pkg/payment/values.prod.yaml index 7603ebc6..12be6773 100644 --- a/freeleaps/helm-pkg/payment/values.prod.yaml +++ b/freeleaps/helm-pkg/payment/values.prod.yaml @@ -106,3 +106,30 @@ payment: controlledResources: - cpu - memory + prometheusRule: + name: freeleaps-prod-payment + enabled: true + namespace: freeleaps-monitoring-system + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsPaymentServiceDown + expr: up{job="payment-service"} == 0 + for: 1m + labels: + severity: critical + service: payment-service + annotations: + summary: Freeleaps Payment service is down (instance {{ $labels.instance }}) + description: Freeleaps Payment service has been down for more than 1 minute. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsPaymentServiceHighErrorRate + expr: rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: payment-service + annotations: + summary: High error rate in freeleaps payment service (instance {{ $labels.instance }}) + description: Freeleaps Payment service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7