16、prometheus + grafana + alertmanager
Installing Prometheus + Grafana + Alertmanager on k8s by hand (raw YAML)
k8s version: k8s-1.29.1
Prometheus + Grafana + Alertmanager monitoring and alerting
1、Installing Prometheus on k8s by hand (raw YAML)
mkdir ~/prometheus-yml

kubectl create ns monitoring
cat > ~/prometheus-yml/prometheus-rbac.yml << 'EOF'
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring
EOF
kubectl apply -f ~/prometheus-yml/prometheus-rbac.yml
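Optional sanity check that the RBAC objects were created (names match the manifest above):

kubectl -n monitoring get sa prometheus
kubectl get clusterrole,clusterrolebinding prometheus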
cat > ~/prometheus-yml/prometheus-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets: ['localhost:9090']
EOF
kubectl apply -f ~/prometheus-yml/prometheus-ConfigMap.yml
For now this only configures Prometheus to scrape itself.
If new resources need to be monitored later, just update this ConfigMap.
cat > ~/prometheus-yml/prometheus-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    # alerting rule files
    rule_files:
    - /etc/prometheus/rules.yml
    - /etc/prometheus/rules/*.rules.yml
    # alertmanager integration
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-service.monitoring.svc.cluster.local:9093"]
    scrape_configs:
    # 0. prometheus itself
    - job_name: prometheus
      static_configs:
      - targets: ['localhost:9090']
    - job_name: 1.15.172.119
      static_configs:
      - targets: ['1.15.172.119:9100']
    # 1. k8s nodes
    - job_name: 'k8s-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    # 2. k8s-etcd
    - job_name: 'k8s-etcd'
      metrics_path: metrics
      scheme: http
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_name]
        regex: etcd-k8s
        action: keep
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
    # 3. kube-apiserver
    - job_name: 'kube-apiserver'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    # 4. kube-controller-manager
    - job_name: 'kube-controller-manager'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
        action: keep
        regex: kube-system;kube-controller-manager
    # 5. kube-scheduler
    - job_name: 'kube-scheduler'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
        action: keep
        regex: kube-system;kube-scheduler
    # 6. kubelet
    - job_name: 'kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
        replacement: $1
    # 7. kube-proxy
    - job_name: 'kube-proxy'
      metrics_path: metrics
      scheme: http
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: false
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_name]
        regex: kube-proxy
        action: keep
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
    # 8. coredns
    - job_name: 'coredns'
      static_configs:
      - targets: ['kube-dns.kube-system.svc.cluster.local:9153']
    # 9. containers (cAdvisor)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
        replacement: $1
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        replacement: /metrics/cadvisor
        target_label: __metrics_path__
    # 10. service auto-discovery
    - job_name: 'k8s-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
    # 11. kube-state-metrics
    - job_name: "kube-state-metrics"
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_endpoints_name]
        regex: kube-system;kube-state-metrics
        action: keep
  # alerting rules
  rules.yml: |
    groups:
    - name: test-node-mem
      rules:
      - alert: NodeMemoryUsage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 20
        for: 1m
        labels:
          cluster: RTG
          severity: P1
        annotations:
          summary: "{{$labels.instance}}: High Memory usage detected"
          description: "{{$labels.instance}}: Memory usage is above 20% (current value is: {{ $value }})"
    - name: Hosts.rules
      rules:
      ## Custom By huanghuanhui
      - alert: HostDown
        expr: up == 0
        for: 1m
        labels:
          cluster: RTG
          severity: P1
        annotations:
          Summary: '主机{{ $labels.instance }} ${{ $labels.job }} down'
          description: "主机: 【{{ $labels.instance }}】has been down for more than 1 minute"
      - alert: HostCpuLoadAvage
        expr: node_load5 / count by (instance, job) (node_cpu_seconds_total{mode="idle"}) >= 0.95
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} cpu 5分钟负载比率大于1 (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 cpu_load5值大于核心数。 (当前比率值:{{ $value }})"
        labels:
          cluster: RTG
          severity: 'P3'
      - alert: HostCpuUsage
        expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} CPU 5分钟使用率大于80% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 5分钟内CPU使用率超过80% (当前值:{{ $value }})"
        labels:
          cluster: RTG
          severity: 'P1'
      - alert: HostMemoryUsage
        expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} 内存使用率大于80% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 内存使用率超过80% (当前使用率:{{ $value }}%)"
        labels:
          cluster: RTG
          severity: 'P3'
      - alert: HostIOWait
        expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} iowait大于10% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 5分钟内磁盘IO过高 (当前负载值:{{ $value }})"
        labels:
          cluster: RTG
          severity: 'P3'
      - alert: HostFileSystemUsage
        expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 80
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} {{ $labels.mountpoint }} 磁盘空间使用大于80% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区使用率超过80%, 当前值使用率:{{ $value }}%"
        labels:
          cluster: RTG
          severity: 'P3'
      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%"
          description: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%"
      - alert: HostNetworkConnection-ESTABLISHED
        expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 2000
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} ESTABLISHED连接数过高 (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 ESTABLISHED连接数超过2000, 当前ESTABLISHED连接数: {{ $value }}"
      - alert: HostNetworkConnection-TIME_WAIT
        expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
        for: 5m
        labels:
          cluster: RTG
          severity: 'P3'
        annotations:
          Summary: "主机{{ $labels.instance }} TIME_WAIT连接数过高 (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 TIME_WAIT连接数超过1000, 当前TIME_WAIT连接数: {{ $value }}"
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"eth.*"}[2m])) / 1024 / 1024 > 300
        for: 5m
        labels:
          cluster: RTG
          severity: 'P3'
        annotations:
          Summary: "主机{{ $labels.instance }} 入口流量超过 (> 300 MB/s) (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 入口流量超过 (> 300 MB/s), 当前值: {{ $value }}"
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"eth.*"}[2m])) / 1024 / 1024 > 300
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 出口流量超过 (> 300 MB/s) (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 出口流量超过 (> 300 MB/s), 当前值: {{ $value }}"
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance, device) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 磁盘读取速率超过(50 MB/s) (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 读取速度超过(50 MB/s), 当前值: {{ $value }}"
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance, device) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 磁盘写入速率超过(50 MB/s) (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 写入速度超过(50 MB/s), 当前值: {{ $value }}"
      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
        for: 2m
        labels:
          cluster: RTG
          severity: 'P3'
        annotations:
          Summary: "主机{{ $labels.instance }} {{ $labels.mountpoint }}分区主机Inode值小于10% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区inode节点不足 (可用值小于{{ $value }}%)"
      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[2m]) / rate(node_disk_reads_completed_total[2m]) * 1000 > 100 and rate(node_disk_reads_completed_total[2m]) > 0
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 主机磁盘Read延迟大于100ms (当前值:{{ $value }}ms)"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Read延迟过高 (read operations > 100ms), 当前延迟值: {{ $value }}ms"
      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total[2m]) / rate(node_disk_writes_completed_total[2m]) * 1000 > 100 and rate(node_disk_writes_completed_total[2m]) > 0
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 主机磁盘write延迟大于100ms (当前值:{{ $value }}ms)"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Write延迟过高 (write operations > 100ms), 当前延迟值: {{ $value }}ms"
      - alert: NodeFilesystemFilesFillingUp
        annotations:
          description: '预计4小时后 分区:{{ $labels.device }} 主机:{{ $labels.instance }} 可用inode仅剩余 {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
          Summary: '主机{{ $labels.instance }} 预计4小时后可用inode数会低于15% (当前值:{{ $value }})'
        labels:
          cluster: RTG
          severity: p3
        expr: |
          (node_filesystem_files_free{job="node-exporter|vm-node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter|vm-node-exporter",fstype!=""} * 100 < 15
          and
          predict_linear(node_filesystem_files_free{job="node-exporter|vm-node-exporter",fstype!=""}[6h], 4*60*60) < 0
          and
          node_filesystem_readonly{job="node-exporter|vm-node-exporter",fstype!=""} == 0)
        for: 1h
      - alert: NodeFileDescriptorLimit
        annotations:
          description: '主机:{{ $labels.instance }} 文件描述符使用率超过70% {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
          Summary: '主机: {{ $labels.instance }}文件描述符即将被耗尽. (当前值:{{ $value }})'
        expr: |
          (node_filefd_allocated{job="node-exporter|vm-node-exporter"} * 100 / node_filefd_maximum{job="node-exporter|vm-node-exporter"} > 70)
        for: 15m
        labels:
          severity: p3
          action: monitor
          cluster: RTG
      - alert: NodeClockSkewDetected
        annotations:
          description: '主机: {{ $labels.instance }} 时钟延时超过 300s.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
          Summary: '主机: {{ $labels.instance }}时钟延时超过 300s.(当前值:{{ $value }})'
        expr: |
          (node_timex_offset_seconds > 0.05
          and
          deriv(node_timex_offset_seconds[5m]) >= 0)
          or
          (node_timex_offset_seconds < -0.05
          and
          deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 10m
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeFilesystemSpaceFillingUp
        annotations:
          description: '主机: {{ $labels.instance }} 分区: {{ $labels.device }} 预计在4小时后只有 {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
          Summary: "主机: {{ $labels.instance }}预计4小时后磁盘空闲会低于15% (当前值:{{ $value }})"
        expr: |
          (node_filesystem_avail_bytes{job="node-exporter|vm-node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter|vm-node-exporter",fstype!=""} * 100 < 15
          and
          predict_linear(node_filesystem_avail_bytes{job="node-exporter|vm-node-exporter",fstype!=""}[6h], 4*60*60) < 0
          and
          node_filesystem_readonly{job="node-exporter|vm-node-exporter",fstype!=""} == 0)
        for: 1h
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeNetworkReceiveErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
          Summary: "主机{{ $labels.instance }} 网卡{{ $labels.device }} Node网络接收错误 (当前值:{{ $value }})"
        expr: |
          increase(node_network_receive_errs_total[2m]) > 10
        for: 2h
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeNetworkTransmitErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
          Summary: "主机{{ $labels.instance }} 网卡{{ $labels.device }} Node网络传输错误 (当前值:{{ $value }})"
        expr: |
          increase(node_network_transmit_errs_total[2m]) > 10
        for: 1h
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeHighNumberConntrackEntriesUsed
        annotations:
          description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
          Summary: 主机{{ $labels.instance }} Conntrack条目使用率大于75% (当前值:{{ $value }})
        expr: |
          (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
        labels:
          severity: p2
          cluster: RTG
      - alert: NodeTextFileCollectorScrapeError
        annotations:
          description: Node Exporter text file collector failed to scrape.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
          Summary: 主机{{ $labels.instance }} 打开或读取文件时出错,(当前值:{{ $value }})
        expr: |
          node_textfile_scrape_error{job="node-exporter|vm-node-exporter"} == 1
        labels:
          severity: p2
          cluster: RTG
      - alert: NodeClockNotSynchronising
        annotations:
          message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
          Summary: 主机{{ $labels.instance }} 时间不同步(当前值:{{ $value }})
        expr: |
          min_over_time(node_timex_sync_status[5m]) == 0
        for: 10m
        labels:
          severity: p4
          cluster: RTG
EOF
kubectl apply -f ~/prometheus-yml/prometheus-ConfigMap.yml

prometheus_podIP=`kubectl get pods -n monitoring -o custom-columns='NAME:metadata.name,podIP:status.podIPs[*].ip' |grep prometheus |awk '{print $2}'`

curl -X POST "http://$prometheus_podIP:9090/-/reload"
# The alert rules are mounted into Prometheus from a ConfigMap so that rules can be added easily later.
# Create an empty prometheus-rules ConfigMap first (in the monitoring namespace) so Prometheus can start normally.
kubectl create configmap prometheus-rules -n monitoring --from-literal=empty=empty
cat > ~/prometheus-yml/prometheus-Deployment.yml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.49.1
        imagePullPolicy: IfNotPresent
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention.time=30d"
        - "--web.enable-admin-api"
        - "--web.enable-lifecycle"
        ports:
        - containerPort: 9090
          name: http
        volumeMounts:
        - mountPath: "/prometheus"
          subPath: prometheus
          name: data
        - mountPath: "/etc/prometheus"
          name: config
        - mountPath: "/etc/prometheus/rules"
          name: rules
        - name: localtime
          mountPath: /etc/localtime
        resources:
          limits:
            cpu: "2"
            memory: "4Gi"
          requests:
            cpu: "1"
            memory: "2Gi"
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus-nfs-client-pvc
      - name: config
        configMap:
          name: prometheus-config
      - name: rules
        configMap:
          name: prometheus-rules
      - name: localtime
        hostPath:
          path: /etc/localtime
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-nfs-client-pvc
  namespace: monitoring
spec:
  storageClassName: nfs-storage
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 2Ti
EOF
kubectl apply -f ~/prometheus-yml/prometheus-Deployment.yml
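Optional sanity check before exposing it: the pod should reach Running and the PVC should be Bound.

kubectl -n monitoring get pods -l app=prometheus
kubectl -n monitoring get pvc prometheus-nfs-client-pvc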
cat > ~/prometheus-yml/prometheus-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: prometheus-service
  namespace: monitoring
  labels:
    app: prometheus
  annotations:
    prometheus.io/port: "9090"
    prometheus.io/scrape: "true"
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
  - name: web
    port: 9090
    targetPort: http
    nodePort: 31111
EOF
kubectl apply -f ~/prometheus-yml/prometheus-Service.yml
cat > ~/prometheus-yml/prometheus-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: prometheus.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prometheus-service
            port:
              number: 9090
  tls:
  - hosts:
    - prometheus.huanghuanhui.cloud
    secretName: prometheus-ingress-tls
EOF
kubectl create secret -n monitoring \
tls prometheus-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/prometheus-Ingress.yml
Access URL: prometheus.huanghuanhui.cloud
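A quick scrape check can also be done against the HTTP API (this sketch assumes a node IP of 192.168.1.201 as used elsewhere in this article, the NodePort 31111 from the Service above, and that jq is installed):

curl -s http://192.168.1.201:31111/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'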
Alert rules
More alert rule examples: https://samber.github.io/awesome-prometheus-alerts/
mkdir -p ~/prometheus-yml/rules-yml
pod.rules
cat > ~/prometheus-yml/rules-yml/pod.rules.yml << 'EOF'
groups:
- name: pod.rules
  rules:
  - alert: PodDown
    expr: kube_pod_container_status_running != 1
    for: 2s
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} down'
      description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} is not running'
  - alert: PodReady
    expr: kube_pod_container_status_ready != 1
    for: 5m # not Ready for 5 minutes, so startup is likely broken
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} ready'
      description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} not ready for 5 minutes'
  - alert: PodRestart
    expr: changes(kube_pod_container_status_restarts_total[30m]) > 0 # pod restarted within the last 30 minutes
    for: 2s
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} restart'
      description: 'namespace: {{ $labels.namespace }}, pod: {{ $labels.pod }} restart {{ $value }} times'
  - alert: PodFailed
    expr: sum (kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
    for: 5s
    labels:
      severity: error
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"
  - alert: PodPending
    expr: sum (kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"
  - alert: PodErrImagePull
    expr: sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态ErrImagePull (当前值: {{ $value }})"
  - alert: PodImagePullBackOff
    expr: sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态ImagePullBackOff (当前值: {{ $value }})"
  - alert: PodCrashLoopBackOff
    expr: sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态CrashLoopBackOff (当前值: {{ $value }})"
  - alert: PodCPUUsage
    expr: sum by(pod, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于80% (当前值: {{ $value }})"
  - alert: PodMemoryUsage
    expr: sum(container_memory_rss{image!=""}) by(pod, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 80
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于80% (当前值: {{ $value }})"
  - alert: PodStatusChange # pod phase changed abnormally often
    expr: changes(kube_pod_status_phase[5m]) > 5
    for: 5m
    annotations:
      summary: "Pod 状态异常变更"
      description: "Pod {{ $labels.pod }} 的状态异常变更次数超过 5 次."
  - alert: ContainerCrash # container crash/throttling alert
    expr: increase(container_cpu_cfs_throttled_seconds_total{container!="",pod!=""}[5m]) > 0
    for: 5m
    annotations:
      summary: "Pod 容器崩溃"
      description: "Pod {{ $labels.pod }} 中的容器发生崩溃."
EOF
svc.rules
cat > ~/prometheus-yml/rules-yml/svc.rules.yml << 'EOF'
groups:
- name: svc.rules
  rules:
  - alert: ServiceDown
    expr: avg_over_time(up[5m]) * 100 < 50
    annotations:
      description: The service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.
      summary: The service {{ $labels.job }} is not responding
EOF
pvc.rules
cat > ~/prometheus-yml/rules-yml/pvc.rules.yml << 'EOF'
groups:
- name: pvc.rules
  rules:
  - alert: PersistentVolumeClaimLost
    expr: sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PersistentVolumeClaimPending
    expr: sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pending"}) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HighPersistentVolumeUsage # PersistentVolume usage too high
    expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 90
    for: 5m
    annotations:
      summary: "PersistentVolume 使用率过高"
      description: "PersistentVolume {{ $labels.persistentvolume }} 的使用率超过 90%."
  - alert: HighPVUsageForPod # volume mounted by a Pod is almost full
    expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 90
    for: 5m
    annotations:
      summary: "Pod 挂载的 PersistentVolume 使用率过高"
      description: "Pod {{ $labels.pod }} 挂载的 PersistentVolume 使用率超过 90%."
EOF
kubeadm.rules
cat > ~/prometheus-yml/rules-yml/kubeadm.rules.yml << 'EOF'
groups:
- name: kubeadm.rules
  rules:
  # kubelet health check
  - alert: KubeletDown
    expr: up{job="kubelet"} == 0
    for: 1m
    annotations:
      summary: "Kubelet 不可用"
      description: "Kubelet {{ $labels.instance }} 不可用."
  # node down
  - alert: NodeDown
    expr: up{job="k8s-nodes"} == 0
    for: 1m
    annotations:
      summary: "Node 不可用"
      description: "Node {{ $labels.node }} 不可用."
  # kube-proxy health check
  - alert: KubeProxyDown
    expr: up{job="kube-proxy"} == 0
    for: 1m
    annotations:
      summary: "Kube Proxy 不可用"
      description: "Kube Proxy {{ $labels.instance }} 不可用."
  # kube-scheduler health check
  - alert: KubeSchedulerDown
    expr: up{job="kube-scheduler"} == 0
    for: 1m
    annotations:
      summary: "Kube Scheduler 不可用"
      description: "Kube Scheduler 不可用."
  # kube-controller-manager health check
  - alert: KubeControllerManagerDown
    expr: up{job="kube-controller-manager"} == 0
    for: 1m
    annotations:
      summary: "Kube Controller Manager 不可用"
      description: "Kube Controller Manager 不可用."
  # kube-state-metrics health check
  - alert: KubeStateMetricsDown
    expr: up{job="kube-state-metrics"} == 0
    for: 1m
    annotations:
      summary: "Kube State Metrics 不可用"
      description: "Kube State Metrics 不可用."
  # node not ready
  - alert: KubernetesNodeNotReady
    expr: sum(kube_node_status_condition{condition="Ready",status="true"}) by (node) == 0
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: Kubernetes node is not ready
      description: A node in the cluster is not ready, which may cause issues with cluster functionality.
EOF
# Update the empty prometheus-rules ConfigMap created earlier
cd ~/prometheus-yml/rules-yml

kubectl create configmap prometheus-rules -n monitoring \
--from-file=pod.rules.yml \
--from-file=svc.rules.yml \
--from-file=pvc.rules.yml \
--from-file=kubeadm.rules.yml \
-o yaml --dry-run=client | kubectl apply -f -
prometheus_podIP=`kubectl get pods -n monitoring -o custom-columns='NAME:metadata.name,podIP:status.podIPs[*].ip' |grep prometheus |awk '{print $2}'`

curl -X POST "http://$prometheus_podIP:9090/-/reload"
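To confirm the rule files were actually picked up after the reload, the rules API can be queried (assumes jq is installed; the group names pod.rules, svc.rules, pvc.rules and kubeadm.rules should appear):

curl -s "http://$prometheus_podIP:9090/api/v1/rules" | jq '.data.groups[].name'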
0、Monitoring k8s nodes (node-exporter)
cat > ~/prometheus-yml/node-exporter.yml << 'EOF'
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      nodeSelector:
        kubernetes.io/os: linux
      containers:
      - name: node-exporter
        image: prom/node-exporter:v1.7.0
        args:
        - --web.listen-address=$(HOSTIP):9100
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
        - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
        ports:
        - containerPort: 9100
        env:
        - name: HOSTIP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        resources:
          requests:
            cpu: 150m
            memory: 180Mi
          limits:
            cpu: 150m
            memory: 180Mi
        securityContext:
          runAsNonRoot: true
          runAsUser: 65534
        volumeMounts:
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: root
          mountPath: /host/root
          mountPropagation: HostToContainer
          readOnly: true
        - name: localtime
          mountPath: /etc/localtime
      tolerations:
      - operator: "Exists"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: root
        hostPath:
          path: /
      - name: localtime
        hostPath:
          path: /etc/localtime
EOF
kubectl apply -f ~/prometheus-yml/node-exporter.yml
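Optional check that the DaemonSet is running on every node and exporting metrics (node-exporter uses hostNetwork, so any node IP works; 192.168.1.201 is just an example IP from this article):

kubectl -n monitoring get ds,pods -l app=node-exporter -o wide
curl -s http://192.168.1.201:9100/metrics | head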
For hosts outside the cluster (for example 1.15.172.119 above), node-exporter can be run directly with Docker:
docker run -d \
--name node-exporter \
--restart=always \
--net="host" \
--pid="host" \
-v "/proc:/host/proc:ro" \
-v "/sys:/host/sys:ro" \
-v "/:/rootfs:ro" \
-e TZ=Asia/Shanghai \
-v /etc/localtime:/etc/localtime \
prom/node-exporter:v1.7.0 \
--path.procfs=/host/proc \
--path.rootfs=/rootfs \
--path.sysfs=/host/sys \
--collector.filesystem.ignored-mount-points='^/(sys|proc|dev|host|etc)($|/)'
Grafana dashboard templates: 8919, 12159
Option 1: manually configure node-exporter targets
# prometheus-ConfigMap.yml
- job_name: 192.168.1.100
  static_configs:
  - targets: ['192.168.1.100:9100']
Option 2: auto-discover node-exporter via Consul
mkdir -p ~/prometheus-yml/consul-yml
cat > ~/prometheus-yml/consul-yml/consul.yaml << 'EOF'
---
apiVersion: v1
kind: Service
metadata:
  name: consul-server
  namespace: monitoring
  labels:
    name: consul-server
spec:
  selector:
    name: consul-server
  ports:
  - name: http
    port: 8500
    targetPort: 8500
  - name: https
    port: 8443
    targetPort: 8443
  - name: rpc
    port: 8400
    targetPort: 8400
  - name: serf-lan-tcp
    protocol: "TCP"
    port: 8301
    targetPort: 8301
  - name: serf-lan-udp
    protocol: "UDP"
    port: 8301
    targetPort: 8301
  - name: serf-wan-tcp
    protocol: "TCP"
    port: 8302
    targetPort: 8302
  - name: serf-wan-udp
    protocol: "UDP"
    port: 8302
    targetPort: 8302
  - name: server
    port: 8300
    targetPort: 8300
  - name: consul-dns
    port: 8600
    targetPort: 8600
---
apiVersion: v1
kind: Service
metadata:
  name: consul-server-http
  namespace: monitoring
spec:
  selector:
    name: consul-server
  type: NodePort
  ports:
  - protocol: TCP
    port: 8500
    targetPort: 8500
    nodePort: 32685
    name: consul-server-tcp
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: consul-server
  namespace: monitoring
  labels:
    name: consul-server
spec:
  serviceName: consul-server
  selector:
    matchLabels:
      name: consul-server
  replicas: 3
  template:
    metadata:
      labels:
        name: consul-server
      annotations:
        prometheus.io/scrape: "true" # prometheus auto-discovery annotation
        prometheus.io/path: "v1/agent/metrics" # consul metrics path
        prometheus.io/port: "8500"
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: "name"
                operator: In
                values:
                - consul-server
            topologyKey: "kubernetes.io/hostname"
      terminationGracePeriodSeconds: 10
      containers:
      - name: consul
        image: ccr.ccs.tencentyun.com/huanghuanhui/consul:1.15.4
        imagePullPolicy: IfNotPresent
        args:
        - "agent"
        - "-server"
        - "-bootstrap-expect=3"
        - "-ui"
        - "-data-dir=/consul/data"
        - "-bind=0.0.0.0"
        - "-client=0.0.0.0"
        - "-advertise=$(POD_IP)"
        - "-retry-join=consul-server-0.consul-server.$(NAMESPACE).svc.cluster.local"
        - "-retry-join=consul-server-1.consul-server.$(NAMESPACE).svc.cluster.local"
        - "-retry-join=consul-server-2.consul-server.$(NAMESPACE).svc.cluster.local"
        - "-domain=cluster.local"
        - "-disable-host-node-id"
        volumeMounts:
        - name: consul-nfs-client-pvc
          mountPath: /consul/data
        - name: localtime
          mountPath: /etc/localtime
        env:
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        ports:
        - containerPort: 8500
          name: http
        - containerPort: 8400
          name: rpc
        - containerPort: 8443
          name: https-port
        - containerPort: 8301
          name: serf-lan
        - containerPort: 8302
          name: serf-wan
        - containerPort: 8600
          name: consul-dns
        - containerPort: 8300
          name: server
      volumes:
      - name: localtime
        hostPath:
          path: /etc/localtime
  volumeClaimTemplates:
  - metadata:
      name: consul-nfs-client-pvc
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: nfs-storage
      resources:
        requests:
          storage: 20Gi
EOF
kubectl apply -f ~/prometheus-yml/consul-yml/consul.yaml
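Optional check that the three Consul servers came up and formed a cluster:

kubectl -n monitoring get pods -l name=consul-server
kubectl -n monitoring exec consul-server-0 -- consul members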
cat > ~/prometheus-yml/consul-yml/consul-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: consul-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: consul.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: consul-server
            port:
              number: 8500
  tls:
  - hosts:
    - consul.huanghuanhui.cloud
    secretName: consul-ingress-tls
EOF
kubectl create secret -n monitoring \
tls consul-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/consul-yml/consul-Ingress.yml
Access via IP: 192.168.1.201:32685
Access via domain: consul.huanghuanhui.cloud
# prometheus-ConfigMap.yml
- job_name: 'consul-prometheus'
  consul_sd_configs:
  - server: 'consul-server-http.monitoring.svc.cluster.local:8500'
  relabel_configs:
  - source_labels: [__meta_consul_service_id]
    regex: (.+)
    target_label: 'node_name'
    replacement: '$1'
  - source_labels: [__meta_consul_service]
    regex: '.*(node-exporter|hosts).*'
    action: keep
# Register services (via NodePort IP)
curl -X PUT -d '{"id": "1.15.172.119-node-exporter","name": "1.15.172.119-node-exporter","address": "1.15.172.119","port": 9100,"checks": [{"http": "http://1.15.172.119:9100/","interval": "5s"}]}' http://192.168.1.201:32685/v1/agent/service/register
curl -X PUT -d '{"id": "192.168.1.200-node-exporter","name": "192.168.1.200-node-exporter","address": "192.168.1.200","port": 9100,"checks": [{"http": "http://192.168.1.200:9100/","interval": "5s"}]}' http://192.168.1.201:32685/v1/agent/service/register

# Register services (via domain)
curl -X PUT -d '{"id": "1.15.172.119-node-exporter","name": "1.15.172.119-node-exporter","address": "1.15.172.119","port": 9100,"checks": [{"http": "http://1.15.172.119:9100/","interval": "5s"}]}' https://consul.huanghuanhui.cloud/v1/agent/service/register
curl -X PUT -d '{"id": "192.168.1.200-node-exporter","name": "192.168.1.200-node-exporter","address": "192.168.1.200","port": 9100,"checks": [{"http": "http://192.168.1.200:9100/","interval": "5s"}]}' https://consul.huanghuanhui.cloud/v1/agent/service/register
The service id or name must contain node-exporter or hosts, otherwise it will not match the keep regex above and will not be auto-discovered.
# Deregister services (via NodePort IP)
curl -X PUT http://192.168.1.201:32685/v1/agent/service/deregister/1.15.172.119-node-exporter
curl -X PUT http://192.168.1.201:32685/v1/agent/service/deregister/192.168.1.200-node-exporter

# Deregister services (via domain)
curl -X PUT https://consul.huanghuanhui.cloud/v1/agent/service/deregister/1.15.172.119-node-exporter
curl -X PUT https://consul.huanghuanhui.cloud/v1/agent/service/deregister/192.168.1.200-node-exporter
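To see what is currently registered in the Consul agent (same NodePort as above; jq is optional and only used for readability):

curl -s http://192.168.1.201:32685/v1/agent/services | jq 'keys'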
Consul bulk registration scripts
mkdir -p ~/prometheus-yml/consul-yml/node-exporter-json

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-1.15.172.119.json << 'EOF'
{
  "id": "1.15.172.119-node-exporter",
  "name": "1.15.172.119-node-exporter",
  "address": "1.15.172.119",
  "port": 9100,
  "tags": ["node-exporter"],
  "checks": [{"http": "http://1.15.172.119:9100/metrics","interval": "5s"}]
}
EOF

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.201.json << 'EOF'
{
  "id": "192.168.1.201-node-exporter",
  "name": "192.168.1.201-node-exporter",
  "address": "192.168.1.201",
  "port": 9100,
  "tags": ["node-exporter"],
  "checks": [{"http": "http://192.168.1.201:9100/metrics","interval": "5s"}]
}
EOF

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.202.json << 'EOF'
{
  "id": "192.168.1.202-node-exporter",
  "name": "192.168.1.202-node-exporter",
  "address": "192.168.1.202",
  "port": 9100,
  "tags": ["node-exporter"],
  "checks": [{"http": "http://192.168.1.202:9100/metrics","interval": "5s"}]
}
EOF

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.203.json << 'EOF'
{
  "id": "192.168.1.203-node-exporter",
  "name": "192.168.1.203-node-exporter",
  "address": "192.168.1.203",
  "port": 9100,
  "tags": ["node-exporter"],
  "checks": [{"http": "http://192.168.1.203:9100/metrics","interval": "5s"}]
}
EOF

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.204.json << 'EOF'
{
  "id": "192.168.1.204-node-exporter",
  "name": "192.168.1.204-node-exporter",
  "address": "192.168.1.204",
  "port": 9100,
  "tags": ["node-exporter"],
  "checks": [{"http": "http://192.168.1.204:9100/metrics","interval": "5s"}]
}
EOF

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.200.json << 'EOF'
{
  "id": "192.168.1.200-node-exporter",
  "name": "192.168.1.200-node-exporter",
  "address": "192.168.1.200",
  "port": 9100,
  "tags": ["node-exporter"],
  "checks": [{"http": "http://192.168.1.200:9100/metrics","interval": "5s"}]
}
EOF
# Add more JSON files as needed, one file per service
# Bulk registration script
cat > ~/prometheus-yml/consul-yml/node-exporter-json/register-service.sh << 'EOF'
#!/bin/bash

CONSUL_API="https://consul.huanghuanhui.cloud/v1/agent/service/register"

declare -a SERVICES=(
  "node-exporter-1.15.172.119.json"
  "node-exporter-192.168.1.201.json"
  "node-exporter-192.168.1.202.json"
  "node-exporter-192.168.1.203.json"
  "node-exporter-192.168.1.204.json"
  "node-exporter-192.168.1.200.json"
  # add more JSON files here, one per service
)

for SERVICE_FILE in "${SERVICES[@]}"; do
  curl -X PUT --data @"$SERVICE_FILE" "$CONSUL_API"
done
EOF
# Bulk deregistration script
cat > ~/prometheus-yml/consul-yml/node-exporter-json/deregister-service.sh << 'EOF'
#!/bin/bash

CONSUL_API="https://consul.huanghuanhui.cloud/v1/agent/service/deregister"

declare -a SERVICES=(
  "node-exporter-1.15.172.119.json"
  "node-exporter-192.168.1.201.json"
  "node-exporter-192.168.1.202.json"
  "node-exporter-192.168.1.203.json"
  "node-exporter-192.168.1.204.json"
  "node-exporter-192.168.1.200.json"
  # add more JSON files here, one per service
)

for SERVICE_FILE in "${SERVICES[@]}"; do
  SERVICE_ID=$(jq -r .id "$SERVICE_FILE")
  curl -X PUT "$CONSUL_API/$SERVICE_ID"
done
EOF
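Both scripts expect to be run from the directory that holds the JSON files, and the deregistration script needs jq installed to read the service id:

cd ~/prometheus-yml/consul-yml/node-exporter-json
chmod +x register-service.sh deregister-service.sh
./register-service.sh      # register every host listed in the JSON files
./deregister-service.sh    # deregister them again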
mkdir -p ~/prometheus-yml/kube-yml
1、Monitoring kube-controller-manager
sed -i 's/bind-address=127.0.0.1/bind-address=0.0.0.0/g' /etc/kubernetes/manifests/kube-controller-manager.yaml
cat > ~/prometheus-yml/kube-yml/prometheus-kube-controller-manager-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-controller-manager
  labels:
    app.kubernetes.io/name: kube-controller-manager
spec:
  selector:
    component: kube-controller-manager
  ports:
  - name: https-metrics
    port: 10257
    targetPort: 10257
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/prometheus-kube-controller-manager-Service.yml
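Optional check: the Service selector relies on the component: kube-controller-manager label that kubeadm sets on the static pod, so the ENDPOINTS column should list the master IP(s); if it is empty, the scrape job above will find no targets.

kubectl -n kube-system get svc,endpoints kube-controller-manager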
2、Monitoring kube-scheduler
sed -i 's/bind-address=127.0.0.1/bind-address=0.0.0.0/g' /etc/kubernetes/manifests/kube-scheduler.yaml
cat > ~/prometheus-yml/kube-yml/prometheus-kube-scheduler-Service.yml << EOF
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-scheduler
  labels:
    app.kubernetes.io/name: kube-scheduler
spec:
  selector:
    component: kube-scheduler
  ports:
  - name: https-metrics
    port: 10259
    targetPort: 10259
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/prometheus-kube-scheduler-Service.yml
3、Monitoring kube-proxy
kubectl get configmap kube-proxy -n kube-system -o yaml | \
sed -e 's/metricsBindAddress: ""/metricsBindAddress: "0.0.0.0:10249"/' | \
kubectl diff -f - -n kube-system

kubectl get configmap kube-proxy -n kube-system -o yaml | \
sed -e 's/metricsBindAddress: ""/metricsBindAddress: "0.0.0.0:10249"/' | \
kubectl apply -f - -n kube-system
kubectl rollout restart daemonset kube-proxy -n kube-system
netstat -tnlp |grep kube-proxy
netstat -antp|grep 10249
cat > ~/prometheus-yml/kube-yml/prometheus-kube-proxy-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: kube-proxy
  namespace: kube-system
  labels:
    k8s-app: kube-proxy
spec:
  selector:
    k8s-app: kube-proxy
  ports:
  - name: https-metrics
    port: 10249
    targetPort: 10249
    protocol: TCP
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/prometheus-kube-proxy-Service.yml
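Optional check from any node: once metricsBindAddress is 0.0.0.0:10249, kube-proxy serves metrics over plain HTTP.

curl -s http://127.0.0.1:10249/metrics | head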
4、Monitoring k8s-etcd
sed -i 's/127.0.0.1:2381/0.0.0.0:2381/g' /etc/kubernetes/manifests/etcd.yaml
cat > ~/prometheus-yml/kube-yml/etcd-k8s-master-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: etcd-k8s
  namespace: kube-system
  labels:
    k8s-app: etcd
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: port
    port: 2381
---
apiVersion: v1
kind: Endpoints
metadata:
  name: etcd-k8s
  namespace: kube-system
  labels:
    k8s-app: etcd
subsets:
- addresses:
  - ip: 192.168.1.200
    nodeName: k8s-01
  ports:
  - name: port
    port: 2381
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/etcd-k8s-master-Service.yml
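Optional check (192.168.1.200 is the master listed in the Endpoints above; port 2381 serves plain HTTP metrics after the listen-metrics-urls change):

curl -s http://192.168.1.200:2381/metrics | head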
https://grafana.com/grafana/dashboards/9733-etcd-for-k8s-cn/
Grafana dashboard template: 9733
2、Installing Grafana on k8s by hand (raw YAML)
cat > ~/prometheus-yml/grafana-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: monitoring
data:
  grafana.ini: |
    [smtp]
    enabled = false
    host = localhost:25
    user =
    password =
    skip_verify = false
    from_address = admin@grafana.localhost
    from_name = Grafana
    [alerting]
    enabled =
    execute_alerts = true
EOF
kubectl apply -f ~/prometheus-yml/grafana-ConfigMap.yml
cat > ~/prometheus-yml/grafana-Deployment.yml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472
        supplementalGroups:
        - 0
      containers:
      - name: grafana
        image: grafana/grafana:10.2.3
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 3000
          name: http-grafana
          protocol: TCP
        env:
        - name: TZ
          value: Asia/Shanghai
        - name: GF_SECURITY_ADMIN_USER
          value: admin
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: Admin@2024
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /robots.txt
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 30
          successThreshold: 1
          timeoutSeconds: 2
        livenessProbe:
          failureThreshold: 3
          initialDelaySeconds: 30
          periodSeconds: 10
          successThreshold: 1
          tcpSocket:
            port: 3000
          timeoutSeconds: 1
        resources:
          limits:
            cpu: "1"
            memory: "2Gi"
          requests:
            cpu: "0.5"
            memory: "1Gi"
        volumeMounts:
        - mountPath: /var/lib/grafana
          name: grafana-data
        - mountPath: /etc/grafana
          name: config
      volumes:
      - name: grafana-data
        persistentVolumeClaim:
          claimName: grafana-nfs-client-pvc
      - name: config
        configMap:
          name: grafana-config
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-nfs-client-pvc
  namespace: monitoring
spec:
  storageClassName: nfs-storage
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 2Ti
---
apiVersion: v1
kind: Service
metadata:
  name: grafana-service
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
  - nodePort: 31300
    port: 3000
  selector:
    app: grafana
EOF
kubectl apply -f ~/prometheus-yml/grafana-Deployment.yml
cat > ~/prometheus-yml/grafana-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: grafana.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: grafana-service
            port:
              number: 3000
  tls:
  - hosts:
    - grafana.huanghuanhui.cloud
    secretName: grafana-ingress-tls
EOF
kubectl create secret -n monitoring \
tls grafana-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/grafana-Ingress.yml
Access URL: grafana.huanghuanhui.cloud
Credentials: admin / Admin@2024
https://grafana.com/grafana/dashboards/
Grafana dashboard templates: 8919, 12159, 13105, 9276, 12006
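The Prometheus data source can be added in the Grafana UI, or, as a sketch using the Grafana HTTP API (assumes the admin credentials above and the in-cluster Service name prometheus-service from earlier):

curl -s -u admin:Admin@2024 -H 'Content-Type: application/json' \
  -X POST https://grafana.huanghuanhui.cloud/api/datasources \
  -d '{"name":"Prometheus","type":"prometheus","access":"proxy","url":"http://prometheus-service.monitoring.svc.cluster.local:9090","isDefault":true}'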
3、Installing Alertmanager on k8s by hand (raw YAML)
Integration with QQ Mail (email)
cat > ~/prometheus-yml/alertmanager-qq-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.qq.com:465'
      smtp_from: '1308470940@qq.com'
      smtp_auth_username: '1308470940@qq.com'
      smtp_auth_password: 'kgwsqpzsvhxvjjii'
      smtp_hello: 'qq.com'
      smtp_require_tls: false
    route:
      group_by: ['alertname', 'cluster']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 5m
      receiver: default
      routes:
      - receiver: email
        group_wait: 10s
        match:
          team: node
    templates:
    - '/etc/config/template/email.tmpl'
    receivers:
    - name: 'default'
      email_configs:
      - to: '1308470940@qq.com'
        html: '{{ template "email.html" . }}'
        headers: { Subject: "[WARN] Prometheus 告警邮件" }
    - name: 'email'
      email_configs:
      - to: '1308470940@qq.com'
        send_resolved: true
EOF
Integration with DingTalk (used as the example below)
cat > ~/prometheus-yml/alertmanager-webhook-dingtalk-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 5m
    route:
      receiver: webhook
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      group_by: [alertname]
      routes:
      - receiver: webhook
        group_wait: 10s
        match:
          team: node
    receivers:
    - name: webhook
      webhook_configs:
      - url: 'http://alertmanager-webhook-dingtalk.monitoring.svc.cluster.local:8060/dingtalk/webhook1/send'
        send_resolved: true
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-webhook-dingtalk-ConfigMap.yml
cat > ~/prometheus-yml/alertmanager-Deployment.yaml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
      - name: alertmanager
        image: prom/alertmanager:v0.26.0
        ports:
        - containerPort: 9093
          name: http
        volumeMounts:
        - name: alertmanager-config
          mountPath: /etc/alertmanager
        - name: alertmanager-data
          mountPath: /alertmanager
        - name: localtime
          mountPath: /etc/localtime
        command:
        - "/bin/alertmanager"
        - "--config.file=/etc/alertmanager/alertmanager.yml"
        - "--storage.path=/alertmanager"
      volumes:
      - name: alertmanager-config
        configMap:
          name: alertmanager-config
      - name: alertmanager-data
        persistentVolumeClaim:
          claimName: alertmanager-nfs-client-pvc
      - name: localtime
        hostPath:
          path: /etc/localtime
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: alertmanager-nfs-client-pvc
  namespace: monitoring
spec:
  storageClassName: nfs-storage
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: "20Gi"
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-Deployment.yaml
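Optional sanity check that Alertmanager started and loaded the ConfigMap without errors:

kubectl -n monitoring get pods -l app=alertmanager
kubectl -n monitoring logs deploy/alertmanager | tail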
cat > ~/prometheus-yml/alertmanager-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-service
  namespace: monitoring
spec:
  selector:
    app: alertmanager
  type: NodePort
  ports:
  - name: web
    port: 9093
    targetPort: http
    nodePort: 30093
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-Service.yml
cat > ~/prometheus-yml/alertmanager-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: alertmanager.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager-service
            port:
              number: 9093
  tls:
  - hosts:
    - alertmanager.huanghuanhui.cloud
    secretName: alertmanager-ingress-tls
EOF
kubectl create secret -n monitoring \
tls alertmanager-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/alertmanager-Ingress.yml
Access URL: alertmanager.huanghuanhui.cloud
DingTalk integration: alertmanager-webhook-dingtalk
cat > ~/prometheus-yml/alertmanager-webhook-dingtalk-Deployment.yaml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-webhook-dingtalk
  namespace: monitoring
data:
  config.yaml: |-
    templates:
    - /config/template.tmpl
    targets:
      webhook1:
        url: https://oapi.dingtalk.com/robot/send?access_token=423eedfe3802198314e15f712f0578545b74a44cb982723623db2fb034bdc83e
        secret: SECd3c53fbbb1df76a987a658e0ca759ef371ae955ff731af8945219e99d143d3ae
  # alert template (what the DingTalk message will look like)
  template.tmpl: |-
    {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{ end }}
    {{ define "__alert_list" }}{{ range . }}
    ---
    {{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}
    >- **告警状态 :** {{ .Status }}
    >- **告警级别 :** **{{ .Labels.severity }}**
    >- **告警类型 :** {{ .Labels.alertname }}
    >- **告警主机 :** {{ .Labels.instance }}
    >- **告警主题 :** {{ .Annotations.summary }}
    >- **告警信息 :** {{ index .Annotations "description" }}
    >- **告警时间 :** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    {{ end }}{{ end }}
    {{ define "__resolved_list" }}{{ range . }}
    ---
    {{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}
    >- **告警状态 :** {{ .Status }}
    >- **告警类型 :** {{ .Labels.alertname }}
    >- **告警主机 :** {{ .Labels.instance }}
    >- **告警主题 :** {{ .Annotations.summary }}
    >- **告警信息 :** {{ index .Annotations "description" }}
    >- **告警时间 :** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    >- **恢复时间 :** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
    {{ end }}{{ end }}
    {{ define "default.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "default.content" }}{{ if gt (len .Alerts.Firing) 0 }}
    **Prometheus-Alertmanager 监控到{{ .Alerts.Firing | len }}个故障**
    {{ template "__alert_list" .Alerts.Firing }}
    ---
    {{ end }}{{ if gt (len .Alerts.Resolved) 0 }}
    **恢复{{ .Alerts.Resolved | len }}个故障**
    {{ template "__resolved_list" .Alerts.Resolved }}
    {{ end }}{{ end }}
    {{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
    {{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
    {{ template "default.title" . }}
    {{ template "default.content" . }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager-webhook-dingtalk
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager-webhook-dingtalk
  template:
    metadata:
      labels:
        app: alertmanager-webhook-dingtalk
    spec:
      volumes:
      - name: config
        configMap:
          name: alertmanager-webhook-dingtalk
      containers:
      - name: alertmanager-webhook-dingtalk
        image: ccr.ccs.tencentyun.com/huanghuanhui/prometheus-alertmanager-webhook-dingtalk:v1
        imagePullPolicy: Always
        args:
        - --web.listen-address=:8060
        - --config.file=/config/config.yaml
        volumeMounts:
        - name: config
          mountPath: /config
        resources:
          limits:
            cpu: 100m
            memory: 100Mi
        ports:
        - name: http
          containerPort: 8060
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-webhook-dingtalk
  namespace: monitoring
spec:
  selector:
    app: alertmanager-webhook-dingtalk
  ports:
  - name: http
    port: 8060
    targetPort: http
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-webhook-dingtalk-Deployment.yaml
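Optional end-to-end test (a sketch that assumes the NodePort 30093 defined earlier and a node IP of 192.168.1.201): push a fake alert carrying the team: node label so it matches the webhook route, then check the Alertmanager UI and the DingTalk group for the message.

curl -X POST http://192.168.1.201:30093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","severity":"P3","team":"node"},"annotations":{"summary":"test alert","description":"manual test from curl"}}]'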
Screenshot: YAML files
Screenshot: DingTalk alert notification