未验证 提交 21904f1e 编写于 作者: 青牛踏雪 提交者: GitHub

add kafka dashboard and alert rules based on categraf acquisition (#1607)

* add kafka dashboard and alert rules based on categraf acquisition

* add kafka dashboard and alert rules based on categraf acquisition
上级 b5d5ecba
[
{
"cate": "prometheus",
"name": "kafka 数据有丢失风险-副本数小于3",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka",
"type=categraf"
]
},
{
"cate": "prometheus",
"name": "kafka 服务宕机",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "kafka_broker_info{service=~\"kafka\"} < 1",
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"type=categraf",
"service=kafka"
]
},
{
"cate": "prometheus",
"name": "kafka 消费能力不足-延迟超过5分钟",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "max(kafka_consumer_lag_millis) by (topic, consumergroup) / 1000 > 300",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka",
"type=categraf"
]
}
]
\ No newline at end of file
{
"name": "Kafka By Categraf",
"tags": "Kafka Prometheus Categraf",
"ident": "",
"configs": {
"var": [
{
"name": "cluster",
"definition": "label_values(kafka_brokers, cluster)",
"type": "query"
}
],
"version": "2.0.0",
"panels": [
{
"id": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"type": "row",
"name": "overview",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"isResizable": false
},
"collapsed": true
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_brokers{cluster=\"$cluster\"}"
}
],
"name": "brokers",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 0,
"y": 1,
"i": "e2c1d271-ec43-4821-aa19-451e856af755",
"isResizable": true
},
"id": "e2c1d271-ec43-4821-aa19-451e856af755"
},
{
"targets": [
{
"refId": "A",
"expr": "count(count by (topic) (kafka_topic_partitions{cluster=\"$cluster\"}))"
}
],
"name": "topics",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 6,
"y": 1,
"i": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98",
"isResizable": true
},
"id": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partitions{cluster=\"$cluster\"})"
}
],
"name": "partitions",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 12,
"y": 1,
"i": "e228d857-746b-41b6-8d2d-0152453c46f4",
"isResizable": true
},
"id": "e228d857-746b-41b6-8d2d-0152453c46f4"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partition_replicas{cluster=\"$cluster\"})"
}
],
"name": "Replicas",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 18,
"y": 1,
"i": "85438099-8d6b-4817-b9b9-1d0ed36029cd",
"isResizable": true
},
"id": "85438099-8d6b-4817-b9b9-1d0ed36029cd"
},
{
"id": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"type": "row",
"name": "throughput",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 4,
"i": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"isResizable": false
},
"collapsed": true
},
{
"targets": [
{
"expr": "sum(rate(kafka_topic_partition_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"name": "Messages produced per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 5,
"i": "c2ec4036-3081-45cc-b672-024c6df93833",
"isResizable": true
},
"id": "c2ec4036-3081-45cc-b672-024c6df93833"
},
{
"targets": [
{
"expr": "sum(rate(kafka_consumergroup_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"name": "Messages consumed per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 5,
"i": "7ad651a6-c12c-4d46-8d01-749fa776faef",
"isResizable": true
},
"id": "7ad651a6-c12c-4d46-8d01-749fa776faef"
},
{
"targets": [
{
"expr": "sum(kafka_consumer_lag_millis{cluster=\"$cluster\"}) by (consumergroup, topic)",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"name": "Latency by Consumer Group",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "humantimeMilliseconds"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 5,
"i": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a",
"isResizable": true
},
"id": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a"
},
{
"id": "20166830-7f85-4665-8f39-bf904267af29",
"type": "row",
"name": "patition/replicate",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 18,
"i": "20166830-7f85-4665-8f39-bf904267af29",
"isResizable": false
},
"collapsed": true
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partitions{cluster=\"$cluster\"}",
"legend": "{{topic}}"
}
],
"name": "Partitions per Topic",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 19,
"i": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b",
"isResizable": true
},
"id": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b"
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partition_under_replicated_partition{cluster=\"$cluster\"}",
"legend": "{{topic}}-{{partition}}"
}
],
"name": "Partitions Under Replicated",
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker\\brokers that are not replicating some of the partitions.",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 19,
"i": "dd615767-dda7-4da6-b37f-0d484553aac6",
"isResizable": true
},
"id": "dd615767-dda7-4da6-b37f-0d484553aac6"
}
],
"links": [
{
"title": "文档",
"url": "https://github.com/ccfos/nightingale/tree/main/integrations/kafka/markdown/",
"targetBlank": true
}
]
}
}
\ No newline at end of file
## VictoriaMetrics Dashboard & Alerts
使用[categraf](https://github.com/flashcatcloud/categraf)[inputs.kafka](https://github.com/flashcatcloud/categraf/tree/main/inputs/kafka)插件采集[kafka](https://kafka.apache.org/)服务监控指标数据:
### 配置文件示例:
下面为配置示例,如果是多个kafka就可以写多个[[instances]];
```toml
[[instances]]
log_level = "error"
kafka_uris = ["192.168.0.250:9092"]
labels = { cluster="kafka-cluster", service="kafka" }
```
### 告警规则
![alerts](./alerts..png)
[alerts](../alerts/kafka_by_categraf.json)
### 仪表盘:
![dashboard](./dashboards.png)
[dashboard](../dashboards/kafka_by_categraf.json)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册