Commit 80d6ee24 authored by Xiaoguang Wang, committed by Shile Zhang

alinux: blk: add iohang check function

Background:
  We do not have a dependable block layer interface to determine whether
a block device has io requests that have remained uncompleted for an
unusually long time. Currently we have the 'in_flight' interface, which
counts the number of I/O requests that have been issued to the device
driver but have not yet completed; it does not include I/O requests that
are in the queue but not yet issued to the device driver, which means it
will not count io requests that have been stuck in the block layer.
  Also, when a steady stream of io requests is issued to the device
driver, 'in_flight' may always be non-zero, yet you still cannot tell
whether any single io request has been outstanding for too long.

Solution:
  To find io requests that have not been completed for too long, add 3
new interfaces (a userspace sketch of polling them follows at the end of
this message):
  /sys/block/vdb/queue/hang_threshold
If an io request's running time exceeds this value (in milliseconds),
count it as hung.

  /sys/block/vdb/hang
Show the hung io request counters for reads and writes.

  /sys/kernel/debug/block/vdb/rq_hang
Show detailed info for all hung io requests, for example:
  ffff97db96301200 {.op=WRITE, .cmd_flags=SYNC, .rq_flags=STARTED|
ELVPRIV|IO_STAT|STATS, .state=in_flight, .tag=30, .internal_tag=169,
.start_time_ns=140634088407, .io_start_time_ns=140634102958,
.current_time=146497371953, .bio = ffff97db91e8e000,
.bio_pages = { ffffd096a0602540 }, .bio = ffff97db91e8ec00,
.bio_pages = { ffffd096a070eec0 }, .bio = ffff97db91e8f600,
.bio_pages = { ffffd096a0424cc0 }, .bio = ffff97db91e8f300,
.bio_pages = { ffffd096a0600a80 }}

With the above info, we can easily see this request's latency
distribution; see the next patch for how bio_pages is used.
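For instance, the three timestamps in the sample above break the latency
down directly:

  io_start_time_ns - start_time_ns = 140634102958 - 140634088407
                                   = 14551 ns (~14.6 us in the block layer)
  current_time - io_start_time_ns  = 146497371953 - 140634102958
                                   = 5863268995 ns (~5.86 s in the driver/device)

so this request has already exceeded the default 5000 ms hang threshold
while outstanding in the device driver.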

Note: /sys/kernel/debug/block/vdb/rq_hang only exists for blk-mq device
drivers and requires CONFIG_BLK_DEBUG_FS to be enabled.
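
As an illustration, below is a minimal userspace sketch (not part of this
patch) of how the new interfaces might be polled; it assumes a device
named vdb and the sysfs paths above, with only basic error handling:

/*
 * Minimal userspace sketch (illustration only, not part of this patch):
 * read the per-queue hang threshold and the per-disk hang counters for
 * a hypothetical device "vdb".
 */
#include <stdio.h>

int main(void)
{
	unsigned int threshold, hung_reads, hung_writes;
	FILE *f;

	/* hang_threshold is reported in milliseconds */
	f = fopen("/sys/block/vdb/queue/hang_threshold", "r");
	if (!f || fscanf(f, "%u", &threshold) != 1)
		return 1;
	fclose(f);

	/* hang shows two counters: hung reads, then hung writes */
	f = fopen("/sys/block/vdb/hang", "r");
	if (!f || fscanf(f, "%u %u", &hung_reads, &hung_writes) != 2)
		return 1;
	fclose(f);

	printf("threshold=%u ms, hung reads=%u, hung writes=%u\n",
	       threshold, hung_reads, hung_writes);
	return 0;
}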
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Parent 4cf2f5de
......@@ -1017,6 +1017,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
q->last_merge = NULL;
q->end_sector = 0;
q->boundary_rq = NULL;
q->rq_hang_threshold = BLK_REQ_HANG_THRESHOLD;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
......
......@@ -213,6 +213,39 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
return count;
}
static void blk_mq_debugfs_rq_hang_show(struct seq_file *m, struct request *rq);
static void blk_mq_check_rq_hang(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
struct seq_file *m = priv;
u64 now = ktime_get_ns();
u64 duration;
duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC);
if (duration < rq->q->rq_hang_threshold)
return;
/* See comments in blk_mq_check_expired() */
if (!refcount_inc_not_zero(&rq->ref))
return;
duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC);
if (duration >= rq->q->rq_hang_threshold)
blk_mq_debugfs_rq_hang_show(m, rq);
if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
}
static int queue_rq_hang_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_rq_hang, m);
return 0;
}
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "poll_stat", 0400, queue_poll_stat_show },
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
......@@ -220,6 +253,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "state", 0600, queue_state_show, queue_state_write },
{ "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
{ "rq_hang", 0400, queue_rq_hang_show, NULL },
{ },
};
......@@ -381,6 +415,56 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
}
EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show);
static void blk_mq_debugfs_rq_hang_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
struct bio *bio;
struct bio_vec *bvec;
int i;
seq_printf(m, "%px {.op=", rq);
if (op < ARRAY_SIZE(op_name) && op_name[op])
seq_printf(m, "%s", op_name[op]);
else
seq_printf(m, "%d", op);
seq_puts(m, ", .cmd_flags=");
blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
ARRAY_SIZE(cmd_flag_name));
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag);
seq_printf(m, ", .start_time_ns=%llu", rq->start_time_ns);
seq_printf(m, ", .io_start_time_ns=%llu", rq->io_start_time_ns);
seq_printf(m, ", .current_time=%llu", ktime_get_ns());
bio = rq->bio;
if (bio) {
while (1) {
seq_printf(m, ", .bio = %px", bio);
seq_puts(m, ", .bio_pages = { ");
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
if (!page)
continue;
seq_printf(m, "%px ", page);
}
seq_puts(m, "}");
bio = bio->bi_next;
if (bio == NULL)
break;
}
}
if (mq_ops->show_rq)
mq_ops->show_rq(m, rq);
seq_puts(m, "}\n");
return;
}
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
{
return __blk_mq_debugfs_rq_show(m, list_entry_rq(v));
......
......@@ -88,6 +88,11 @@ struct mq_inflight {
unsigned int *inflight;
};
struct mq_hang {
struct hd_struct *part;
unsigned int *hang;
};
static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv,
bool reserved)
......@@ -133,6 +138,30 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
}
static void blk_mq_check_hang_rw(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
struct mq_hang *mh = priv;
u64 now = ktime_get_ns();
u64 duration;
duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC);
if (duration < rq->q->rq_hang_threshold)
return;
if (!mh->part->partno || rq->part == mh->part)
mh->hang[rq_data_dir(rq)]++;
}
void blk_mq_in_hang_rw(struct request_queue *q, struct hd_struct *part,
unsigned int hang[2])
{
struct mq_hang mh = { .part = part, .hang = hang, };
hang[0] = hang[1] = 0;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_hang_rw, &mh);
}
void blk_freeze_queue_start(struct request_queue *q)
{
int freeze_depth;
......@@ -476,7 +505,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
static void __blk_mq_free_request(struct request *rq)
void __blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_ctx *ctx = rq->mq_ctx;
......
......@@ -154,6 +154,8 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
void blk_mq_in_hang_rw(struct request_queue *q, struct hd_struct *part,
unsigned int hang[2]);
static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
{
......
......@@ -66,6 +66,13 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
void blk_queue_rq_hang_threshold(struct request_queue *q,
unsigned int hang_threshold)
{
q->rq_hang_threshold = hang_threshold;
}
EXPORT_SYMBOL_GPL(blk_queue_rq_hang_threshold);
void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
{
WARN_ON_ONCE(q->mq_ops);
......
......@@ -516,6 +516,27 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
static ssize_t queue_hang_threshold_show(struct request_queue *q, char *page)
{
return sprintf(page, "%u\n", q->rq_hang_threshold);
}
static ssize_t queue_hang_threshold_store(struct request_queue *q,
const char *page, size_t count)
{
unsigned int hang_threshold;
int err;
err = kstrtou32(page, 10, &hang_threshold);
if (err || hang_threshold == 0)
return -EINVAL;
blk_queue_rq_hang_threshold(q, hang_threshold);
return count;
}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = 0644 },
.show = queue_requests_show,
......@@ -695,6 +716,12 @@ static struct queue_sysfs_entry queue_wb_lat_entry = {
.store = queue_wb_lat_store,
};
static struct queue_sysfs_entry queue_hang_entry = {
.attr = {.name = "hang_threshold", .mode = 0644 },
.show = queue_hang_threshold_show,
.store = queue_hang_threshold_store,
};
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static struct queue_sysfs_entry throtl_sample_time_entry = {
.attr = {.name = "throttle_sample_time", .mode = 0644 },
......@@ -737,6 +764,7 @@ static struct attribute *default_attrs[] = {
&queue_dax_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
&queue_hang_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
&throtl_sample_time_entry.attr,
#endif
......
......@@ -94,6 +94,39 @@ void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
inflight[1] = atomic_read(&part->in_flight[1]);
}
static void blk_in_hang_rw(struct request_queue *q, struct hd_struct *part,
unsigned int hang[2])
{
u64 now, duration;
unsigned long flags;
struct request *rq, *tmp;
now = ktime_get_ns();
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC);
if (duration < rq->q->rq_hang_threshold)
continue;
if (!part->partno || rq->part == part)
hang[rq_data_dir(rq)]++;
}
spin_unlock_irqrestore(q->queue_lock, flags);
}
void part_in_hang_rw(struct request_queue *q, struct hd_struct *part,
unsigned int hang[2])
{
if (q->mq_ops) {
blk_mq_in_hang_rw(q, part, hang);
return;
}
blk_in_hang_rw(q, part, hang);
}
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
......@@ -1157,6 +1190,7 @@ static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(hang, 0444, part_hang_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
......@@ -1179,6 +1213,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
&dev_attr_hang.attr,
&dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
......
......@@ -164,6 +164,17 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}
ssize_t part_hang_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q = part_to_disk(p)->queue;
unsigned int hang[2];
part_in_hang_rw(q, p, hang);
return sprintf(buf, "%8u %8u\n", hang[0], hang[1]);
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
......@@ -195,6 +206,7 @@ static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(hang, 0444, part_hang_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
......@@ -209,6 +221,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
&dev_attr_hang.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
......
......@@ -233,6 +233,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void __blk_mq_free_request(struct request *rq);
void blk_mq_free_request(struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
......
......@@ -436,6 +436,12 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
#endif /* CONFIG_BLK_DEV_ZONED */
/*
* default request hang threshold, unit is milliseconds. If one request does
* not complete within this threshold, consider the request as hung.
*/
#define BLK_REQ_HANG_THRESHOLD 5000
struct request_queue {
/*
* Together with queue_head for cacheline sharing
......@@ -581,6 +587,7 @@ struct request_queue {
unsigned int request_fn_active;
unsigned int rq_timeout;
unsigned int rq_hang_threshold;
int poll_nsec;
struct blk_stat_callback *poll_cb;
......@@ -1264,6 +1271,8 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
extern void blk_queue_rq_hang_threshold(struct request_queue *q,
unsigned int threshold);
/*
* Number of physical segments as sent to the device.
......
......@@ -398,6 +398,8 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part,
int rw);
void part_inc_in_flight(struct request_queue *q, struct hd_struct *part,
int rw);
void part_in_hang_rw(struct request_queue *q, struct hd_struct *part,
unsigned int hang[2]);
static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
{
......@@ -643,6 +645,8 @@ extern ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t part_inflight_show(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t part_hang_show(struct device *dev,
struct device_attribute *attr, char *buf);
#ifdef CONFIG_FAIL_MAKE_REQUEST
extern ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf);
......