Prometheus 钉钉告警模板


Alert Rules

# Prometheus alerting rule groups.
groups:
# Availability check: fires for any scrape target whose `up` metric is 0.
- name: 'node running status'
  rules:
  - alert: 'Instance Down'
    expr: 'up == 0'
    # The condition must hold for 5s before the alert fires.
    for: 5s
    annotations:
      title: 'Instance Down'
      # A space separates the instance from the word "down" so the message
      # renders as "<instance> down" instead of "<instance>down".
      description: '{{ $labels.instance }} down'
    labels:
      # Matched by the Alertmanager route that forwards to the 'jcss' receiver.
      robot: 'jcss'
      severity: 'warning'
      # Rendered as "@<owner>" by the default DingTalk message template.
      owner: 'xxxxxxxxxxx'

# Memory pressure check: fires when used memory exceeds 85% of total.
- name: 'node memory usage'
  rules:
  - alert: 'memory usage'
    # MemAvailable is used instead of MemFree: MemFree excludes reclaimable
    # page cache and buffers, so a healthy host with a warm cache would be
    # reported as nearly full and raise false alerts.
    # NOTE(review): requires a kernel that reports MemAvailable (Linux 3.14+).
    expr: '((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) > 85'
    for: 5s
    annotations:
      title: 'Mem'
      description: '{{ $labels.instance }} Memusage {{ $value }}'
    labels:
      robot: 'jcss'
      # Quoted on purpose: the Alertmanager 'ops' route matches the string 'true'.
      ops: 'true'
      severity: 'warning'
      owner: 'xxxxxxxxxxx'

Alertmanager Route

# Root route: notifications are grouped by alertname and delivered to the
# "ops" receiver by default.
# NOTE(review): group_interval=1s and repeat_interval=30s re-send very
# frequently; confirm these are intended outside of testing.
route:
  receiver: "ops"
  group_by:
    - "alertname"
  group_wait: "30s"
  group_interval: "1s"
  repeat_interval: "30s"
  routes:
    # Alerts labelled ops="true" go to "ops"; `continue: true` lets matching
    # carry on to the next sibling route as well.
    - receiver: "ops"
      continue: true
      match:
        ops: "true"
    # Alerts labelled robot="jcss" go to the "jcss" receiver.
    - receiver: "jcss"
      match:
        robot: "jcss"
# Webhook receivers; each one posts to its own /dingtalk/<name>/send endpoint.
receivers:
  - name: "ops"
    webhook_configs:
      - url: "http://notice.liyblog.com:8060/dingtalk/ops/send"
  - name: "jcss"
    webhook_configs:
      - url: "http://notice.liyblog.com:8060/dingtalk/jcss/send"

# Mute "warning" alerts while a "critical" alert with the same alertname,
# dev and instance label values is firing.
inhibit_rules:
  - equal:
      - "alertname"
      - "dev"
      - "instance"
    source_match:
      severity: "critical"
    target_match:
      severity: "warning"

Prometheus-Webhook-Dingtalk

config.yml

# Message template files picked up from the contrib directory.
templates:
  - "contrib/templates/*.tmpl"

# One entry per DingTalk robot; the key is the target name.
targets:
  # No `message` override here.
  # NOTE(review): presumably falls back to the tool's default templates - confirm.
  jcss:
    # The robot's access token goes after "access_token=" (left blank here).
    url: "https://oapi.dingtalk.com/robot/send?access_token="
    # Left empty in this sample.
    secret:
    mention:
      mobiles:
        - "xxxxxxxxxxx"

  # Uses the "ops.title" / "ops.content" templates for its messages.
  ops:
    url: "https://oapi.dingtalk.com/robot/send?access_token="
    secret:
    message:
      title: '{{ template "ops.title" . }}'
      text: '{{ template "ops.content" . }}'

default.tmpl

{{/* __subject: status tag in brackets, e.g. "[FIRING:2]"; the count is only added while firing. */}}
{{/* The trim markers keep the line breaks around the tag out of the rendered text. */}}
{{ define "__subject" -}}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{- end }}


{{/* __alert_list: one Markdown section per alert in the list passed as dot. */}}
{{/* Each section carries an optional "@owner" mention, then title, severity, instance, description and start time (Asia/Shanghai). */}}
{{ define "__alert_list" }}{{ range $alert := . }}
---
{{ with $alert.Labels.owner }}@{{ . }}{{ end }}

**告警名称**: {{ $alert.Annotations.title }} 

**告警级别**: {{ $alert.Labels.severity }} 

**告警主机**: {{ $alert.Labels.instance }} 

**告警信息**: {{ $alert.Annotations.description }}

**告警时间**: {{ dateInZone "2006.01.02 15:04:05" $alert.StartsAt "Asia/Shanghai" }}
{{ end }}{{ end }}

{{/* __resolved_list: one Markdown section per resolved alert in the list passed as dot. */}}
{{/* Same fields as the firing list plus the recovery time taken from EndsAt. */}}
{{ define "__resolved_list" }}{{ range $alert := . }}
---
{{ with $alert.Labels.owner }}@{{ . }}{{ end }}

**告警名称**: {{ $alert.Annotations.title }}

**告警级别**: {{ $alert.Labels.severity }}

**告警主机**: {{ $alert.Labels.instance }}

**告警信息**: {{ $alert.Annotations.description }}

**告警时间**: {{ dateInZone "2006.01.02 15:04:05" $alert.StartsAt "Asia/Shanghai" }}

**恢复时间**: {{ dateInZone "2006.01.02 15:04:05" $alert.EndsAt "Asia/Shanghai" }}
{{ end }}{{ end }}


{{/* default.title: message title, delegating to "__subject". */}}
{{/* Kept on one line so this define adds no newlines of its own around the title. */}}
{{ define "default.title" }}{{ template "__subject" . }}{{ end }}

{{/* default.content: message body. */}}
{{/* Prints a counted heading plus "__alert_list" when any alerts are firing, and a counted heading plus "__resolved_list" when any are resolved. */}}
{{ define "default.content" }}
{{ with .Alerts.Firing }}
**====侦测到{{ len . }}个故障====**
{{ template "__alert_list" . }}
---
{{ end }}

{{ with .Alerts.Resolved }}
**====恢复{{ len . }}个故障====**
{{ template "__resolved_list" . }}
{{ end }}
{{ end }}


{{/* Aliases exposing the default title/content under the "ding.link.*" names. */}}
{{/* The bare top-level template calls that used to follow were removed: they sat outside any define, so no named template rendered them. */}}
{{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
{{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}

ops.tmpl

{{/* __ops_alert_list: one Markdown section per firing alert in the list passed as dot. */}}
{{/* Shows title, severity, instance, description and start time (Asia/Shanghai); no owner mention. */}}
{{ define "__ops_alert_list" }}{{ range $alert := . }}
---
**告警名称**: {{ $alert.Annotations.title }} 

**告警级别**: {{ $alert.Labels.severity }} 

**告警主机**: {{ $alert.Labels.instance }} 

**告警信息**: {{ $alert.Annotations.description }}

**告警时间**: {{ dateInZone "2006.01.02 15:04:05" $alert.StartsAt "Asia/Shanghai" }}
{{ end }}{{ end }}

{{/* __ops_resolved_list: one Markdown section per resolved alert in the list passed as dot. */}}
{{/* Same fields as the ops firing list plus the recovery time taken from EndsAt. */}}
{{ define "__ops_resolved_list" }}{{ range $alert := . }}
---
**告警名称**: {{ $alert.Annotations.title }}

**告警级别**: {{ $alert.Labels.severity }}

**告警主机**: {{ $alert.Labels.instance }}

**告警信息**: {{ $alert.Annotations.description }}

**告警时间**: {{ dateInZone "2006.01.02 15:04:05" $alert.StartsAt "Asia/Shanghai" }}

**恢复时间**: {{ dateInZone "2006.01.02 15:04:05" $alert.EndsAt "Asia/Shanghai" }}
{{ end }}{{ end }}

{{/* ops.title: title for the ops target, delegating to "__subject" (defined in default.tmpl). */}}
{{/* Kept on one line so this define adds no newlines of its own around the title. */}}
{{ define "ops.title" }}{{ template "__subject" . }}{{ end }}

{{/* ops.content: message body for the ops target. */}}
{{/* Prints a counted heading plus "__ops_alert_list" when any alerts are firing, and a counted heading plus "__ops_resolved_list" when any are resolved. */}}
{{ define "ops.content" }}
{{ with .Alerts.Firing }}
**====侦测到{{ len . }}个故障====**
{{ template "__ops_alert_list" . }}
---
{{ end }}

{{ with .Alerts.Resolved }}
**====恢复{{ len . }}个故障====**
{{ template "__ops_resolved_list" . }}
{{ end }}
{{ end }}

参考资料

Prometheus 官网
作者【SoulChild随笔记】的 Alertmanager 自定义告警模板