作者:尹正杰
版权声明:原创作品,谢绝转载!否则将追究法律责任。
一.配置钉钉
1.钉钉自定义机器人接入参考文档
参考链接:https://open.dingtalk.com/document/orgapp/custom-robot-access
2.配置钉钉机器人
如上图所示,注册钉钉并创建群聊,然后添加“自定义机器人”,并配置自定义关键词。该关键词的作用是:消息发送到钉钉群之前,钉钉会检查消息内容是否包含该关键词,若包含则会在群里显示;若不包含,消息就不会被发送到群里,群成员也就无法查看。这相当于一个“口令”,一定要确保你发送的消息中包含该关键词。
二.alertmanager实现告警并发送钉钉
1.编写源代码接收告警处理
package mainimport ("bytes""encoding/json""errors""flag""fmt""log""net/http""time""github.com/gin-gonic/gin""github.com/prometheus/alertmanager/notify/webhook""github.com/prometheus/alertmanager/template"
)// DingMessage 定义消息结构体,最终字段结构需要符合官方的定义,参考链接:
//
// https://open.dingtalk.com/document/orgapp/custom-robot-access#title-72m-8ag-pqw
type DingMessage struct {
	// Msgtype is the DingTalk message type, serialized as "msgtype".
	Msgtype string `json:"msgtype"`
	// Text holds the message body, serialized as {"text": {"content": ...}}.
	Text struct {
		Content string `json:"content"`
	} `json:"text"`
	// At is serialized as {"at": {"atMobiles": [...]}}.
	// NOTE(review): presumably the mobile numbers of the group members to
	// @-mention, per the DingTalk robot documentation — confirm before use.
	At struct {
		AtMobiles []string `json:"atMobiles"`
	} `json:"at"`
}

// buildDingDingContent builds the DingTalk message for a single alert.
func buildDingDingContent(msg template.Alert) ([]byte, error) {recM := map[string]string{"firing": "已触发", "resolved": "已恢复"}msgTpl := fmt.Sprintf("规则名称: %s\n"+"是否已经恢复: %s\n"+"告警级别: %s\n"+"触发时间: %s\n"+"看图连接: %s\n"+"当前值: %s\n"+"标签组: %s",msg.Labels["alertname"],recM[msg.Status],msg.Labels["severity"],msg.StartsAt.In(time.Local).Format("2006-01-02 15:04:05"),msg.GeneratorURL,msg.Annotations["value"],msg.Labels.SortedPairs(),)dm := DingMessage{Msgtype: "text"}dm.Text.Content = msgTplbs, err := json.Marshal(dm)return bs, err}// SendToDing 发送消息到钉钉
func SendToDing(jsonByte []byte) {// 注意哈,这个口令换成你的钉钉群机器人的webhook即可。dingDingUrl := "https://oapi.dingtalk.com/robot/send?access_token=7cd79af7b9a29f1877438f53f4be10e6f744bb0dae39bb256eee518bebe46cd5"req, err := http.NewRequest("POST", dingDingUrl, bytes.NewBuffer(jsonByte))if err != nil {log.Printf("err = %v\n", err)return}req.Header.Set("Content-Type", "application/json")client := &http.Client{}resp, err := client.Do(req)log.Printf("钉钉发送完成")if err != nil {log.Printf("http.post.request.err|url:%v|err:%v", dingDingUrl, err)return}defer resp.Body.Close()
}func AlertReceiveFunc(c *gin.Context) {var msg webhook.Messageif err := c.BindJSON(&msg); err != nil {c.JSON(400, errors.New("invalid args"))return}baseMsg := fmt.Sprintf("状态: %s|报警条数: %d", msg.Status, len(msg.Alerts))log.Printf("alertReceive|baseMsg:%+v", baseMsg)for i := 0; i < len(msg.Alerts); i++ {alert := msg.Alerts[i]bs, _ := buildDingDingContent(alert)log.Printf("detail|%d/%d|alert:%+v", i+1, len(msg.Alerts), alert)SendToDing(bs)}c.JSON(http.StatusOK, "ok")
}func main() {listenAddr := flag.String("addr", ":8888", "WebUI expose port")flag.Parse()r := gin.Default()r.POST("/alert", AlertReceiveFunc)r.Run(*listenAddr)
}
2.修改alertmanager配置文件
[root@prometheus-server31 ~]# cat /yinzhengjie/softwares/alertmanager-0.27.0.linux-amd64/alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 1s
  group_interval: 3s
  repeat_interval: 10s
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://10.0.0.1:8888/alert'
    http_config: {}
    max_alerts: 0
    send_resolved: true
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
[root@prometheus-server31 ~]#
3.配置Prometheus的规则文件
[root@prometheus-server31 ~]# cat /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/yinzhengjie_rules.yml
groups:
- name: xixi
  rules:
  - alert: yinzhengjie_mysqld_exporter-alert
    expr: node_boot_time_seconds{instance="10.0.0.42:9100", job="yinzhengjie_mysqld_exporter"} > 0
    labels:
      severity: critical
      blog: "https://www.cnblogs.com/yinzhengjie"
    annotations:
      summary: DBA机器异常
- name: haha
  rules:
  - alert: yinzhengjie_bigdata_exporter-alert
    expr: node_boot_time_seconds{instance="10.0.0.41:9100", job="yinzhengjie_bigdata_exporter"} > 0
    labels:
      severity: warning
      auther: 尹正杰
    annotations:
      summary: 大数据集群机器异常
[root@prometheus-server31 ~]#
4.重启服务,观察配置是否触发告警
1.修改Prometheus的启动脚本并添加如下的启动参数
--web.external-url=http://10.0.0.31:9090/
[root@prometheus-server31 ~]# systemctl daemon-reload
2.重启服务
[root@prometheus-server31 ~]# systemctl restart prometheus-server.service alertmanager.service
如上图所示,我的测试代码已经成功发送告警啦。
5.验证钉钉配置是否生效
如上图所示,我们配置的告警信息已经成功发送到钉钉群啦。