blk-mq: don't allocate driver tag upfront for flush rq

The idea behind it is simple:

1) for none scheduler, driver tag has to be borrowed for flush rq,
   otherwise we may run out of tag, and that causes an IO hang. And
   get/put driver tag is actually noop for none, so reordering tags
   isn't necessary at all.

2) for a real I/O scheduler, we need not allocate a driver tag upfront
   for flush rq. It works just fine to follow the same approach as
   normal requests: allocate driver tag for each rq just before calling
   ->queue_rq().

One driver visible change is that the driver tag isn't shared in the
flush request sequence. That won't be a problem, since we always do that
in legacy path.

Then flush rq need not be treated specially wrt. get/put driver tag.
This cleans up the code - for instance, reorder_tags_to_front() can be
removed, and we needn't worry about request ordering in dispatch list
for avoiding I/O deadlock.

Also we have to put the driver tag before requeueing.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Ming Lei 2017-11-02 23:24:38 +08:00 committed by Jens Axboe
parent 244c65a3cc
commit 923218f616
3 changed files with 37 additions and 81 deletions

View File

@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */ /* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags); spin_lock_irqsave(&fq->mq_flush_lock, flags);
hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); if (!q->elevator) {
flush_rq->tag = -1; blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
flush_rq->tag = -1;
} else {
blk_mq_put_driver_tag_hctx(hctx, flush_rq);
flush_rq->internal_tag = -1;
}
} }
running = &fq->flush_queue[fq->flush_running_idx]; running = &fq->flush_queue[fq->flush_running_idx];
@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
blk_rq_init(q, flush_rq); blk_rq_init(q, flush_rq);
/* /*
* Borrow tag from the first request since they can't * In case of none scheduler, borrow tag from the first request
* be in flight at the same time. And acquire the tag's * since they can't be in flight at the same time. And acquire
* ownership for flush req. * the tag's ownership for flush req.
*
* In case of IO scheduler, flush rq need to borrow scheduler tag
* just for cheating put/get driver tag.
*/ */
if (q->mq_ops) { if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->tag = first_rq->tag;
fq->orig_rq = first_rq;
hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); if (!q->elevator) {
blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); fq->orig_rq = first_rq;
flush_rq->tag = first_rq->tag;
hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
} else {
flush_rq->internal_tag = first_rq->internal_tag;
}
} }
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
hctx = blk_mq_map_queue(q, ctx->cpu); hctx = blk_mq_map_queue(q, ctx->cpu);
if (q->elevator) {
WARN_ON(rq->tag < 0);
blk_mq_put_driver_tag_hctx(hctx, rq);
}
/* /*
* After populating an empty queue, kick it to avoid stall. Read * After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io(). * the comment in flush_end_io().

View File

@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
return true; return true;
} }
if (has_sched) { if (has_sched)
rq->rq_flags |= RQF_SORTED; rq->rq_flags |= RQF_SORTED;
WARN_ON(rq->tag != -1);
}
return false; return false;
} }
/*
* Add flush/fua to the queue. If we fail getting a driver tag, then
* punt to the requeue list. Requeue will re-invoke us from a context
* that's safe to block from.
*/
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
struct request *rq, bool can_block)
{
if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
blk_insert_flush(rq);
blk_mq_run_hw_queue(hctx, true);
} else
blk_mq_add_to_requeue_list(rq, false, true);
}
void blk_mq_sched_insert_request(struct request *rq, bool at_head, void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block) bool run_queue, bool async, bool can_block)
{ {
@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
/* flush rq in flush machinery need to be dispatched directly */ /* flush rq in flush machinery need to be dispatched directly */
if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
blk_mq_sched_insert_flush(hctx, rq, can_block); blk_insert_flush(rq);
return; goto run;
} }
WARN_ON(e && (rq->tag != -1));
if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
goto run; goto run;
@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
struct elevator_queue *e = hctx->queue->elevator; struct elevator_queue *e = hctx->queue->elevator;
if (e) {
struct request *rq, *next;
/*
* We bypass requests that already have a driver tag assigned,
* which should only be flushes. Flushes are only ever inserted
* as single requests, so we shouldn't ever hit the
* WARN_ON_ONCE() below (but let's handle it just in case).
*/
list_for_each_entry_safe(rq, next, list, queuelist) {
if (WARN_ON_ONCE(rq->tag != -1)) {
list_del_init(&rq->queuelist);
blk_mq_sched_bypass_insert(hctx, true, rq);
}
}
}
if (e && e->type->ops.mq.insert_requests) if (e && e->type->ops.mq.insert_requests)
e->type->ops.mq.insert_requests(hctx, list, false); e->type->ops.mq.insert_requests(hctx, list, false);
else else

View File

@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq)
{ {
struct request_queue *q = rq->q; struct request_queue *q = rq->q;
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq); trace_block_rq_requeue(q, rq);
wbt_requeue(q->rq_wb, &rq->issue_stat); wbt_requeue(q->rq_wb, &rq->issue_stat);
blk_mq_sched_requeue_request(rq); blk_mq_sched_requeue_request(rq);
@ -996,30 +998,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
return rq->tag != -1; return rq->tag != -1;
} }
/*
* If we fail getting a driver tag because all the driver tags are already
* assigned and on the dispatch list, BUT the first entry does not have a
* tag, then we could deadlock. For that case, move entries with assigned
* driver tags to the front, leaving the set of tagged requests in the
* same order, and the untagged set in the same order.
*/
static bool reorder_tags_to_front(struct list_head *list)
{
struct request *rq, *tmp, *first = NULL;
list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
if (rq == first)
break;
if (rq->tag != -1) {
list_move(&rq->queuelist, list);
if (!first)
first = rq;
}
}
return first != NULL;
}
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key) void *key)
{ {
@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist); rq = list_first_entry(list, struct request, queuelist);
if (!blk_mq_get_driver_tag(rq, &hctx, false)) { if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
if (!queued && reorder_tags_to_front(list))
continue;
/* /*
* The initial allocation attempt failed, so we need to * The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. * rerun the hardware queue when a tag is freed.
@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
nxt = list_first_entry(list, struct request, queuelist); nxt = list_first_entry(list, struct request, queuelist);
blk_mq_put_driver_tag(nxt); blk_mq_put_driver_tag(nxt);
} }
blk_mq_put_driver_tag_hctx(hctx, rq);
list_add(&rq->queuelist, list); list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq); __blk_mq_requeue_request(rq);
break; break;
@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (unlikely(is_flush_fua)) { if (unlikely(is_flush_fua)) {
blk_mq_put_ctx(data.ctx); blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio); blk_mq_bio_to_request(rq, bio);
if (q->elevator) {
blk_mq_sched_insert_request(rq, false, true, true, /* bypass scheduler for flush rq */
true); blk_insert_flush(rq);
} else { blk_mq_run_hw_queue(data.hctx, true);
blk_insert_flush(rq);
blk_mq_run_hw_queue(data.hctx, true);
}
} else if (plug && q->nr_hw_queues == 1) { } else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL; struct request *last = NULL;