398 lines
14 KiB
Diff
398 lines
14 KiB
Diff
|
|
From 480c0fc479ec882786cdb58d699cf84ce5995531 Mon Sep 17 00:00:00 2001
|
|||
|
|
From: zhuofeng <zhuofeng2@huawei.com>
|
|||
|
|
Date: Fri, 14 Feb 2025 09:42:27 +0800
|
|||
|
|
Subject: [PATCH] fix bug of ebpf and ai_block_io
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
src/c/ebpf_collector/ebpf_collector.bpf.c | 357 ------------------
|
|||
|
|
.../sentryPlugins/ai_block_io/detector.py | 3 +-
|
|||
|
|
2 files changed, 2 insertions(+), 358 deletions(-)
|
|||
|
|
|
|||
|
|
diff --git a/src/c/ebpf_collector/ebpf_collector.bpf.c b/src/c/ebpf_collector/ebpf_collector.bpf.c
|
|||
|
|
index 417618d..7a2f481 100644
|
|||
|
|
--- a/src/c/ebpf_collector/ebpf_collector.bpf.c
|
|||
|
|
+++ b/src/c/ebpf_collector/ebpf_collector.bpf.c
|
|||
|
|
@@ -590,361 +590,4 @@ int kprobe_bio_endio(struct pt_regs *regs)
|
|||
|
|
return 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
-// start get_tag
|
|||
|
|
-SEC("kprobe/blk_mq_get_tag")
|
|||
|
|
-int kprobe_blk_mq_get_tag(struct pt_regs *regs)
|
|||
|
|
-{
|
|||
|
|
- u64 tagkey = bpf_get_current_task();
|
|||
|
|
- u64 value = (u64)PT_REGS_PARM1(regs);
|
|||
|
|
- (void)bpf_map_update_elem(&tag_args, &tagkey, &value, BPF_ANY);
|
|||
|
|
-
|
|||
|
|
- struct blk_mq_alloc_data *bd;
|
|||
|
|
- struct request_queue *q;
|
|||
|
|
- struct backing_dev_info *backing_dev_info;
|
|||
|
|
- struct device *owner;
|
|||
|
|
- dev_t devt;
|
|||
|
|
- unsigned int cmd_flags = 0;
|
|||
|
|
-
|
|||
|
|
- bd = (struct blk_mq_alloc_data *)value;
|
|||
|
|
- bpf_core_read(&q, sizeof(q), &bd->q);
|
|||
|
|
- bpf_core_read(&backing_dev_info, sizeof(backing_dev_info), &q->backing_dev_info);
|
|||
|
|
- bpf_core_read(&owner, sizeof(owner), &backing_dev_info->owner);
|
|||
|
|
- bpf_core_read(&devt, sizeof(devt), &owner->devt);
|
|||
|
|
- int major = MAJOR(devt);
|
|||
|
|
- int first_minor = MINOR(devt);
|
|||
|
|
-
|
|||
|
|
- if (major == 0) {
|
|||
|
|
- log_event(STAGE_GET_TAG, PERIOD_START, ERROR_MAJOR_ZERO);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u32 key = find_matching_key_get_tag(major, first_minor);
|
|||
|
|
- if (key >= MAP_SIZE) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct io_counter *counterp, zero = {};
|
|||
|
|
- init_io_counter(&zero, major, first_minor);
|
|||
|
|
- counterp = bpf_map_lookup_elem(&tag_map, &tagkey);
|
|||
|
|
- if (counterp) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
- long err = bpf_map_update_elem(&tag_map, &tagkey, &zero, BPF_NOEXIST);
|
|||
|
|
- if (err) {
|
|||
|
|
- log_event(STAGE_GET_TAG, PERIOD_START, ERROR_UPDATE_FAIL);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u64 curr_start_range = zero.start_time / THRESHOLD;
|
|||
|
|
-
|
|||
|
|
- struct update_params params = {
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .cmd_flags = cmd_flags,
|
|||
|
|
- .curr_start_range = curr_start_range,
|
|||
|
|
- };
|
|||
|
|
-
|
|||
|
|
- struct stage_data *curr_data;
|
|||
|
|
- curr_data = bpf_map_lookup_elem(&tag_res, &key);
|
|||
|
|
- if (!curr_data) {
|
|||
|
|
- struct stage_data new_data = {
|
|||
|
|
- .start_count = 1,
|
|||
|
|
- .finish_count = 0,
|
|||
|
|
- .finish_over_time = 0,
|
|||
|
|
- .duration = 0,
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .io_type = "",
|
|||
|
|
- };
|
|||
|
|
- blk_fill_rwbs(new_data.io_type, cmd_flags);
|
|||
|
|
- bpf_map_update_elem(&tag_res, &key, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- update_curr_data_in_start(curr_data, ¶ms);
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct time_range_io_count *curr_data_time_range;
|
|||
|
|
- curr_data_time_range = bpf_map_lookup_elem(&tag_res_2, &curr_start_range);
|
|||
|
|
- if (curr_data_time_range == NULL) {
|
|||
|
|
- struct time_range_io_count new_data = { .count = {0} };
|
|||
|
|
- bpf_map_update_elem(&tag_res_2, &curr_start_range, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- if (key < MAP_SIZE && key >= 0) {
|
|||
|
|
- __sync_fetch_and_add(&curr_data_time_range->count[key], 1);
|
|||
|
|
- }
|
|||
|
|
- }
|
|||
|
|
- return 0;
|
|||
|
|
-}
|
|||
|
|
-
|
|||
|
|
-// finish get_tag
|
|||
|
|
-SEC("kretprobe/blk_mq_get_tag")
|
|||
|
|
-int kretprobe_blk_mq_get_tag(struct pt_regs *regs)
|
|||
|
|
-{
|
|||
|
|
- u64 tagkey = bpf_get_current_task();
|
|||
|
|
- u64 *tagargs = (u64 *)bpf_map_lookup_elem(&tag_args, &tagkey);
|
|||
|
|
- if (tagargs == NULL) {
|
|||
|
|
- bpf_map_delete_elem(&tag_args, &tagkey);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct blk_mq_alloc_data *bd;
|
|||
|
|
- struct request_queue *q;
|
|||
|
|
- struct backing_dev_info *backing_dev_info;
|
|||
|
|
- struct device *owner;
|
|||
|
|
- dev_t devt;
|
|||
|
|
- unsigned int cmd_flags = 0;
|
|||
|
|
-
|
|||
|
|
- bd = (struct blk_mq_alloc_data *)*tagargs;
|
|||
|
|
- bpf_core_read(&q, sizeof(q), &bd->q);
|
|||
|
|
- bpf_core_read(&backing_dev_info, sizeof(backing_dev_info), &q->backing_dev_info);
|
|||
|
|
- bpf_core_read(&owner, sizeof(owner), &backing_dev_info->owner);
|
|||
|
|
- bpf_core_read(&devt, sizeof(devt), &owner->devt);
|
|||
|
|
- int major = MAJOR(devt);
|
|||
|
|
- int first_minor = MINOR(devt);
|
|||
|
|
-
|
|||
|
|
- if (major == 0) {
|
|||
|
|
- log_event(STAGE_GET_TAG, PERIOD_END, ERROR_MAJOR_ZERO);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u32 key = find_matching_key_get_tag(major, first_minor);
|
|||
|
|
- if (key >= MAP_SIZE) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct io_counter *counterp = bpf_map_lookup_elem(&tag_map, &tagkey);
|
|||
|
|
- if (!counterp) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u64 duration = bpf_ktime_get_ns() - counterp->start_time;
|
|||
|
|
- u64 curr_start_range = counterp->start_time / THRESHOLD;
|
|||
|
|
-
|
|||
|
|
- struct update_params params = {
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .cmd_flags = cmd_flags,
|
|||
|
|
- .curr_start_range = curr_start_range,
|
|||
|
|
- };
|
|||
|
|
-
|
|||
|
|
- struct stage_data *curr_data;
|
|||
|
|
- curr_data = bpf_map_lookup_elem(&tag_res, &key);
|
|||
|
|
- if (curr_data == NULL && duration > DURATION_THRESHOLD) {
|
|||
|
|
- struct stage_data new_data = {
|
|||
|
|
- .start_count = 1,
|
|||
|
|
- .finish_count = 1,
|
|||
|
|
- .finish_over_time = 1,
|
|||
|
|
- .duration = 0,
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .io_type = "",
|
|||
|
|
- };
|
|||
|
|
- blk_fill_rwbs(new_data.io_type, cmd_flags);
|
|||
|
|
- bpf_map_update_elem(&tag_res, &key, &new_data, 0);
|
|||
|
|
- } else if (curr_data == NULL) {
|
|||
|
|
- struct stage_data new_data = {
|
|||
|
|
- .start_count = 1,
|
|||
|
|
- .finish_count = 1,
|
|||
|
|
- .finish_over_time = 0,
|
|||
|
|
- .duration = 0,
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .io_type = "",
|
|||
|
|
- };
|
|||
|
|
- blk_fill_rwbs(new_data.io_type, cmd_flags);
|
|||
|
|
- bpf_map_update_elem(&tag_res, &key, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- curr_data->duration += duration;
|
|||
|
|
- update_curr_data_in_finish(curr_data, ¶ms, duration);
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct time_range_io_count *curr_data_time_range;
|
|||
|
|
- curr_data_time_range = bpf_map_lookup_elem(&tag_res_2, &curr_start_range);
|
|||
|
|
- if (curr_data_time_range == NULL) {
|
|||
|
|
- struct time_range_io_count new_data = { .count = {0} };
|
|||
|
|
- bpf_map_update_elem(&tag_res_2, &curr_start_range, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- if (key < MAP_SIZE && curr_data_time_range->count[key] > 0) {
|
|||
|
|
- __sync_fetch_and_add(&curr_data_time_range->count[key], -1);
|
|||
|
|
- }
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- bpf_map_delete_elem(&tag_map, &tagkey);
|
|||
|
|
- bpf_map_delete_elem(&tag_args, &tagkey);
|
|||
|
|
- return 0;
|
|||
|
|
-}
|
|||
|
|
-
|
|||
|
|
-// start wbt
|
|||
|
|
-SEC("kprobe/wbt_wait")
|
|||
|
|
-int kprobe_wbt_wait(struct pt_regs *regs)
|
|||
|
|
-{
|
|||
|
|
- u64 wbtkey = bpf_get_current_task();
|
|||
|
|
- u64 value = (u64)PT_REGS_PARM2(regs);
|
|||
|
|
- (void)bpf_map_update_elem(&wbt_args, &wbtkey, &value, BPF_ANY);
|
|||
|
|
-
|
|||
|
|
- struct bio *bio;
|
|||
|
|
- struct gendisk *curr_rq_disk;
|
|||
|
|
- int major, first_minor;
|
|||
|
|
- unsigned int cmd_flags;
|
|||
|
|
-
|
|||
|
|
- bio = (struct bio *)value;
|
|||
|
|
- bpf_core_read(&curr_rq_disk, sizeof(curr_rq_disk), &bio->bi_disk);
|
|||
|
|
- bpf_core_read(&major, sizeof(major), &curr_rq_disk->major);
|
|||
|
|
- bpf_core_read(&first_minor, sizeof(first_minor), &curr_rq_disk->first_minor);
|
|||
|
|
- bpf_core_read(&cmd_flags, sizeof(cmd_flags), &bio->bi_opf);
|
|||
|
|
-
|
|||
|
|
- if (major == 0) {
|
|||
|
|
- log_event(STAGE_WBT, PERIOD_START, ERROR_MAJOR_ZERO);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u32 key = find_matching_key_wbt(major, first_minor);
|
|||
|
|
- if (key >= MAP_SIZE) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct io_counter *counterp, zero = {};
|
|||
|
|
- init_io_counter(&zero, major, first_minor);
|
|||
|
|
- counterp = bpf_map_lookup_elem(&wbt_map, &wbtkey);
|
|||
|
|
- if (counterp) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
- long err = bpf_map_update_elem(&wbt_map, &wbtkey, &zero, BPF_NOEXIST);
|
|||
|
|
- if (err) {
|
|||
|
|
- log_event(STAGE_WBT, PERIOD_START, ERROR_UPDATE_FAIL);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u64 curr_start_range = zero.start_time / THRESHOLD;
|
|||
|
|
-
|
|||
|
|
- struct update_params params = {
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .cmd_flags = cmd_flags,
|
|||
|
|
- .curr_start_range = curr_start_range,
|
|||
|
|
- };
|
|||
|
|
-
|
|||
|
|
- struct stage_data *curr_data;
|
|||
|
|
- curr_data = bpf_map_lookup_elem(&wbt_res, &key);
|
|||
|
|
- if (!curr_data) {
|
|||
|
|
- struct stage_data new_data = {
|
|||
|
|
- .start_count = 1,
|
|||
|
|
- .finish_count = 0,
|
|||
|
|
- .finish_over_time = 0,
|
|||
|
|
- .duration = 0,
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .io_type = "",
|
|||
|
|
- };
|
|||
|
|
- blk_fill_rwbs(new_data.io_type, cmd_flags);
|
|||
|
|
- bpf_map_update_elem(&wbt_res, &key, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- update_curr_data_in_start(curr_data, ¶ms);
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct time_range_io_count *curr_data_time_range;
|
|||
|
|
- curr_data_time_range = bpf_map_lookup_elem(&wbt_res_2, &curr_start_range);
|
|||
|
|
- if (curr_data_time_range == NULL) {
|
|||
|
|
- struct time_range_io_count new_data = { .count = {0} };
|
|||
|
|
- bpf_map_update_elem(&wbt_res_2, &curr_start_range, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- if (key < MAP_SIZE && key >= 0) {
|
|||
|
|
- __sync_fetch_and_add(&curr_data_time_range->count[key], 1);
|
|||
|
|
- }
|
|||
|
|
- }
|
|||
|
|
- return 0;
|
|||
|
|
-}
|
|||
|
|
-
|
|||
|
|
-// finish wbt
|
|||
|
|
-SEC("kretprobe/wbt_wait")
|
|||
|
|
-int kretprobe_wbt_wait(struct pt_regs *regs)
|
|||
|
|
-{
|
|||
|
|
- u64 wbtkey = bpf_get_current_task();
|
|||
|
|
- u64 *wbtargs = (u64 *)bpf_map_lookup_elem(&wbt_args, &wbtkey);
|
|||
|
|
- if (wbtargs == NULL) {
|
|||
|
|
- bpf_map_delete_elem(&wbt_args, &wbtkey);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct bio *bio;
|
|||
|
|
- struct gendisk *curr_rq_disk;
|
|||
|
|
- int major, first_minor;
|
|||
|
|
- unsigned int cmd_flags;
|
|||
|
|
-
|
|||
|
|
- bio = (struct bio *)(*wbtargs);
|
|||
|
|
- bpf_core_read(&curr_rq_disk, sizeof(curr_rq_disk), &bio->bi_disk);
|
|||
|
|
- bpf_core_read(&major, sizeof(major), &curr_rq_disk->major);
|
|||
|
|
- bpf_core_read(&first_minor, sizeof(first_minor), &curr_rq_disk->first_minor);
|
|||
|
|
- bpf_core_read(&cmd_flags, sizeof(cmd_flags), &bio->bi_opf);
|
|||
|
|
-
|
|||
|
|
- if (major == 0) {
|
|||
|
|
- log_event(STAGE_WBT, PERIOD_END, ERROR_MAJOR_ZERO);
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u32 key = find_matching_key_wbt(major, first_minor);
|
|||
|
|
- if (key >= MAP_SIZE) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct io_counter *counterp = bpf_map_lookup_elem(&wbt_map, &wbtkey);
|
|||
|
|
- if (!counterp) {
|
|||
|
|
- return 0;
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- u64 duration = bpf_ktime_get_ns() - counterp->start_time;
|
|||
|
|
- u64 curr_start_range = counterp->start_time / THRESHOLD;
|
|||
|
|
-
|
|||
|
|
- struct update_params params = {
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .cmd_flags = cmd_flags,
|
|||
|
|
- .curr_start_range = curr_start_range,
|
|||
|
|
- };
|
|||
|
|
-
|
|||
|
|
- struct stage_data *curr_data;
|
|||
|
|
- curr_data = bpf_map_lookup_elem(&wbt_res, &key);
|
|||
|
|
- if (curr_data == NULL && duration > DURATION_THRESHOLD) {
|
|||
|
|
- struct stage_data new_data = {
|
|||
|
|
- .start_count = 1,
|
|||
|
|
- .finish_count = 1,
|
|||
|
|
- .finish_over_time = 1,
|
|||
|
|
- .duration = 0,
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- .io_type = "",
|
|||
|
|
- };
|
|||
|
|
- blk_fill_rwbs(new_data.io_type, cmd_flags);
|
|||
|
|
- bpf_map_update_elem(&wbt_res, &key, &new_data, 0);
|
|||
|
|
- } else if (curr_data == NULL) {
|
|||
|
|
- struct stage_data new_data = {
|
|||
|
|
- .start_count = 1,
|
|||
|
|
- .finish_count = 1,
|
|||
|
|
- .finish_over_time = 0,
|
|||
|
|
- .duration = 0,
|
|||
|
|
- .io_type = "",
|
|||
|
|
- .major = major,
|
|||
|
|
- .first_minor = first_minor,
|
|||
|
|
- };
|
|||
|
|
- blk_fill_rwbs(new_data.io_type, cmd_flags);
|
|||
|
|
- bpf_map_update_elem(&wbt_res, &key, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- curr_data->duration += duration;
|
|||
|
|
- update_curr_data_in_finish(curr_data, ¶ms, duration);
|
|||
|
|
- }
|
|||
|
|
-
|
|||
|
|
- struct time_range_io_count *curr_data_time_range;
|
|||
|
|
- curr_data_time_range = bpf_map_lookup_elem(&wbt_res_2, &curr_start_range);
|
|||
|
|
- if (curr_data_time_range == NULL) {
|
|||
|
|
- struct time_range_io_count new_data = { .count = {0} };
|
|||
|
|
- bpf_map_update_elem(&wbt_res_2, &curr_start_range, &new_data, 0);
|
|||
|
|
- } else {
|
|||
|
|
- if (key < MAP_SIZE && curr_data_time_range->count[key] > 0) {
|
|||
|
|
- __sync_fetch_and_add(&curr_data_time_range->count[key], -1);
|
|||
|
|
- }
|
|||
|
|
- }
|
|||
|
|
- bpf_map_delete_elem(&wbt_map, &wbtkey);
|
|||
|
|
- bpf_map_delete_elem(&wbt_args, &wbtkey);
|
|||
|
|
- return 0;
|
|||
|
|
-}
|
|||
|
|
-
|
|||
|
|
char _license[] SEC("license") = "GPL";
|
|||
|
|
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
|||
|
|
index 27fb7f7..2688cb1 100644
|
|||
|
|
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
|||
|
|
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
|||
|
|
@@ -55,11 +55,12 @@ class Detector:
|
|||
|
|
detection_result = self._slidingWindow.is_slow_io_event(metric_value)
|
|||
|
|
# 检测到慢周期,由Detector负责打印info级别日志
|
|||
|
|
if detection_result[0][1]:
|
|||
|
|
+ ai_threshold = "None" if detection_result[2] is None else round(detection_result[2], 3)
|
|||
|
|
logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
|
|||
|
|
f'stage: {self._metric_name.stage_name}, '
|
|||
|
|
f'iotype: {self._metric_name.io_access_type_name}, '
|
|||
|
|
f'type: {self._metric_name.metric_name}, '
|
|||
|
|
- f'ai_threshold: {round(detection_result[2], 3)}, '
|
|||
|
|
+ f'ai_threshold: {ai_threshold}, '
|
|||
|
|
f'curr_val: {metric_value}')
|
|||
|
|
else:
|
|||
|
|
logging.debug(f'Detection result: {str(detection_result)}')
|
|||
|
|
--
|
|||
|
|
2.33.0
|
|||
|
|
|