xref: /linux/block/blk-mq.c (revision c9636244f86ae80b66e8ffb05ff755d85edf1988)
1 /*
2  * Block multiqueue core code
3  *
4  * Copyright (C) 2013-2014 Jens Axboe
5  * Copyright (C) 2013-2014 Christoph Hellwig
6  */
7 #include <linux/kernel.h>
8 #include <linux/module.h>
9 #include <linux/backing-dev.h>
10 #include <linux/bio.h>
11 #include <linux/blkdev.h>
12 #include <linux/kmemleak.h>
13 #include <linux/mm.h>
14 #include <linux/init.h>
15 #include <linux/slab.h>
16 #include <linux/workqueue.h>
17 #include <linux/smp.h>
18 #include <linux/llist.h>
19 #include <linux/list_sort.h>
20 #include <linux/cpu.h>
21 #include <linux/cache.h>
22 #include <linux/sched/sysctl.h>
23 #include <linux/sched/topology.h>
24 #include <linux/sched/signal.h>
25 #include <linux/delay.h>
26 #include <linux/crash_dump.h>
27 #include <linux/prefetch.h>
28 
29 #include <trace/events/block.h>
30 
31 #include <linux/blk-mq.h>
32 #include "blk.h"
33 #include "blk-mq.h"
34 #include "blk-mq-debugfs.h"
35 #include "blk-mq-tag.h"
36 #include "blk-stat.h"
37 #include "blk-wbt.h"
38 #include "blk-mq-sched.h"
39 
40 static void blk_mq_poll_stats_start(struct request_queue *q);
41 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
42 
43 static int blk_mq_poll_stats_bkt(const struct request *rq)
44 {
45 	int ddir, bytes, bucket;
46 
47 	ddir = rq_data_dir(rq);
48 	bytes = blk_rq_bytes(rq);
49 
50 	bucket = ddir + 2*(ilog2(bytes) - 9);
51 
52 	if (bucket < 0)
53 		return -1;
54 	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
55 		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
56 
57 	return bucket;
58 }
59 
60 /*
61  * Check if any of the ctx's have pending work in this hardware queue
62  */
63 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
64 {
65 	return sbitmap_any_bit_set(&hctx->ctx_map) ||
66 			!list_empty_careful(&hctx->dispatch) ||
67 			blk_mq_sched_has_work(hctx);
68 }
69 
70 /*
71  * Mark this ctx as having pending work in this hardware queue
72  */
73 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
74 				     struct blk_mq_ctx *ctx)
75 {
76 	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
77 		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
78 }
79 
80 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
81 				      struct blk_mq_ctx *ctx)
82 {
83 	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
84 }
85 
86 void blk_freeze_queue_start(struct request_queue *q)
87 {
88 	int freeze_depth;
89 
90 	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
91 	if (freeze_depth == 1) {
92 		percpu_ref_kill(&q->q_usage_counter);
93 		blk_mq_run_hw_queues(q, false);
94 	}
95 }
96 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
97 
98 void blk_mq_freeze_queue_wait(struct request_queue *q)
99 {
100 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
101 }
102 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
103 
104 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
105 				     unsigned long timeout)
106 {
107 	return wait_event_timeout(q->mq_freeze_wq,
108 					percpu_ref_is_zero(&q->q_usage_counter),
109 					timeout);
110 }
111 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
112 
/*
 * Freeze @q and wait until no request is in use, so that queue data
 * structures can be changed afterwards.
 *
 * In the !blk_mq case this is only called to kill q_usage_counter;
 * otherwise it bumps the freeze depth and waits for the usage counter to
 * reach zero. That is why there is no blk_unfreeze_queue() and why this
 * function is not exported to drivers: the only user of unfreeze is blk_mq.
 */
void blk_freeze_queue(struct request_queue *q)
{
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
129 
/*
 * Alias of blk_freeze_queue(), provided so that freeze and unfreeze both
 * live in the blk_mq_* namespace.
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
139 
140 void blk_mq_unfreeze_queue(struct request_queue *q)
141 {
142 	int freeze_depth;
143 
144 	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
145 	WARN_ON_ONCE(freeze_depth < 0);
146 	if (!freeze_depth) {
147 		percpu_ref_reinit(&q->q_usage_counter);
148 		wake_up_all(&q->mq_freeze_wq);
149 	}
150 }
151 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
152 
153 /*
154  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
155  * mpt3sas driver such that this function can be removed.
156  */
157 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
158 {
159 	unsigned long flags;
160 
161 	spin_lock_irqsave(q->queue_lock, flags);
162 	queue_flag_set(QUEUE_FLAG_QUIESCED, q);
163 	spin_unlock_irqrestore(q->queue_lock, flags);
164 }
165 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
166 
167 /**
168  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
169  * @q: request queue.
170  *
171  * Note: this function does not prevent that the struct request end_io()
172  * callback function is invoked. Once this function is returned, we make
173  * sure no dispatch can happen until the queue is unquiesced via
174  * blk_mq_unquiesce_queue().
175  */
176 void blk_mq_quiesce_queue(struct request_queue *q)
177 {
178 	struct blk_mq_hw_ctx *hctx;
179 	unsigned int i;
180 	bool rcu = false;
181 
182 	blk_mq_quiesce_queue_nowait(q);
183 
184 	queue_for_each_hw_ctx(q, hctx, i) {
185 		if (hctx->flags & BLK_MQ_F_BLOCKING)
186 			synchronize_srcu(hctx->queue_rq_srcu);
187 		else
188 			rcu = true;
189 	}
190 	if (rcu)
191 		synchronize_rcu();
192 }
193 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
194 
195 /*
196  * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
197  * @q: request queue.
198  *
199  * This function recovers queue into the state before quiescing
200  * which is done by blk_mq_quiesce_queue.
201  */
202 void blk_mq_unquiesce_queue(struct request_queue *q)
203 {
204 	unsigned long flags;
205 
206 	spin_lock_irqsave(q->queue_lock, flags);
207 	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
208 	spin_unlock_irqrestore(q->queue_lock, flags);
209 
210 	/* dispatch requests which are inserted during quiescing */
211 	blk_mq_run_hw_queues(q, true);
212 }
213 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
214 
215 void blk_mq_wake_waiters(struct request_queue *q)
216 {
217 	struct blk_mq_hw_ctx *hctx;
218 	unsigned int i;
219 
220 	queue_for_each_hw_ctx(q, hctx, i)
221 		if (blk_mq_hw_queue_mapped(hctx))
222 			blk_mq_tag_wakeup_all(hctx->tags, true);
223 
224 	/*
225 	 * If we are called because the queue has now been marked as
226 	 * dying, we need to ensure that processes currently waiting on
227 	 * the queue are notified as well.
228 	 */
229 	wake_up_all(&q->mq_freeze_wq);
230 }
231 
232 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
233 {
234 	return blk_mq_has_free_tags(hctx->tags);
235 }
236 EXPORT_SYMBOL(blk_mq_can_queue);
237 
238 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
239 		unsigned int tag, unsigned int op)
240 {
241 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
242 	struct request *rq = tags->static_rqs[tag];
243 
244 	rq->rq_flags = 0;
245 
246 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
247 		rq->tag = -1;
248 		rq->internal_tag = tag;
249 	} else {
250 		if (blk_mq_tag_busy(data->hctx)) {
251 			rq->rq_flags = RQF_MQ_INFLIGHT;
252 			atomic_inc(&data->hctx->nr_active);
253 		}
254 		rq->tag = tag;
255 		rq->internal_tag = -1;
256 		data->hctx->tags->rqs[rq->tag] = rq;
257 	}
258 
259 	INIT_LIST_HEAD(&rq->queuelist);
260 	/* csd/requeue_work/fifo_time is initialized before use */
261 	rq->q = data->q;
262 	rq->mq_ctx = data->ctx;
263 	rq->cmd_flags = op;
264 	if (blk_queue_io_stat(data->q))
265 		rq->rq_flags |= RQF_IO_STAT;
266 	/* do not touch atomic flags, it needs atomic ops against the timer */
267 	rq->cpu = -1;
268 	INIT_HLIST_NODE(&rq->hash);
269 	RB_CLEAR_NODE(&rq->rb_node);
270 	rq->rq_disk = NULL;
271 	rq->part = NULL;
272 	rq->start_time = jiffies;
273 #ifdef CONFIG_BLK_CGROUP
274 	rq->rl = NULL;
275 	set_start_time_ns(rq);
276 	rq->io_start_time_ns = 0;
277 #endif
278 	rq->nr_phys_segments = 0;
279 #if defined(CONFIG_BLK_DEV_INTEGRITY)
280 	rq->nr_integrity_segments = 0;
281 #endif
282 	rq->special = NULL;
283 	/* tag was already set */
284 	rq->extra_len = 0;
285 
286 	INIT_LIST_HEAD(&rq->timeout_list);
287 	rq->timeout = 0;
288 
289 	rq->end_io = NULL;
290 	rq->end_io_data = NULL;
291 	rq->next_rq = NULL;
292 
293 	data->ctx->rq_dispatched[op_is_sync(op)]++;
294 	return rq;
295 }
296 
297 static struct request *blk_mq_get_request(struct request_queue *q,
298 		struct bio *bio, unsigned int op,
299 		struct blk_mq_alloc_data *data)
300 {
301 	struct elevator_queue *e = q->elevator;
302 	struct request *rq;
303 	unsigned int tag;
304 	struct blk_mq_ctx *local_ctx = NULL;
305 
306 	blk_queue_enter_live(q);
307 	data->q = q;
308 	if (likely(!data->ctx))
309 		data->ctx = local_ctx = blk_mq_get_ctx(q);
310 	if (likely(!data->hctx))
311 		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
312 	if (op & REQ_NOWAIT)
313 		data->flags |= BLK_MQ_REQ_NOWAIT;
314 
315 	if (e) {
316 		data->flags |= BLK_MQ_REQ_INTERNAL;
317 
318 		/*
319 		 * Flush requests are special and go directly to the
320 		 * dispatch list.
321 		 */
322 		if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
323 			e->type->ops.mq.limit_depth(op, data);
324 	}
325 
326 	tag = blk_mq_get_tag(data);
327 	if (tag == BLK_MQ_TAG_FAIL) {
328 		if (local_ctx) {
329 			blk_mq_put_ctx(local_ctx);
330 			data->ctx = NULL;
331 		}
332 		blk_queue_exit(q);
333 		return NULL;
334 	}
335 
336 	rq = blk_mq_rq_ctx_init(data, tag, op);
337 	if (!op_is_flush(op)) {
338 		rq->elv.icq = NULL;
339 		if (e && e->type->ops.mq.prepare_request) {
340 			if (e->type->icq_cache && rq_ioc(bio))
341 				blk_mq_sched_assign_ioc(rq, bio);
342 
343 			e->type->ops.mq.prepare_request(rq, bio);
344 			rq->rq_flags |= RQF_ELVPRIV;
345 		}
346 	}
347 	data->hctx->queued++;
348 	return rq;
349 }
350 
351 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
352 		unsigned int flags)
353 {
354 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
355 	struct request *rq;
356 	int ret;
357 
358 	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
359 	if (ret)
360 		return ERR_PTR(ret);
361 
362 	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
363 
364 	if (!rq)
365 		return ERR_PTR(-EWOULDBLOCK);
366 
367 	blk_mq_put_ctx(alloc_data.ctx);
368 	blk_queue_exit(q);
369 
370 	rq->__data_len = 0;
371 	rq->__sector = (sector_t) -1;
372 	rq->bio = rq->biotail = NULL;
373 	return rq;
374 }
375 EXPORT_SYMBOL(blk_mq_alloc_request);
376 
377 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
378 		unsigned int op, unsigned int flags, unsigned int hctx_idx)
379 {
380 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
381 	struct request *rq;
382 	unsigned int cpu;
383 	int ret;
384 
385 	/*
386 	 * If the tag allocator sleeps we could get an allocation for a
387 	 * different hardware context.  No need to complicate the low level
388 	 * allocator for this for the rare use case of a command tied to
389 	 * a specific queue.
390 	 */
391 	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
392 		return ERR_PTR(-EINVAL);
393 
394 	if (hctx_idx >= q->nr_hw_queues)
395 		return ERR_PTR(-EIO);
396 
397 	ret = blk_queue_enter(q, true);
398 	if (ret)
399 		return ERR_PTR(ret);
400 
401 	/*
402 	 * Check if the hardware context is actually mapped to anything.
403 	 * If not tell the caller that it should skip this queue.
404 	 */
405 	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
406 	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
407 		blk_queue_exit(q);
408 		return ERR_PTR(-EXDEV);
409 	}
410 	cpu = cpumask_first(alloc_data.hctx->cpumask);
411 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
412 
413 	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
414 
415 	if (!rq)
416 		return ERR_PTR(-EWOULDBLOCK);
417 
418 	blk_queue_exit(q);
419 
420 	return rq;
421 }
422 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
423 
424 void blk_mq_free_request(struct request *rq)
425 {
426 	struct request_queue *q = rq->q;
427 	struct elevator_queue *e = q->elevator;
428 	struct blk_mq_ctx *ctx = rq->mq_ctx;
429 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
430 	const int sched_tag = rq->internal_tag;
431 
432 	if (rq->rq_flags & RQF_ELVPRIV) {
433 		if (e && e->type->ops.mq.finish_request)
434 			e->type->ops.mq.finish_request(rq);
435 		if (rq->elv.icq) {
436 			put_io_context(rq->elv.icq->ioc);
437 			rq->elv.icq = NULL;
438 		}
439 	}
440 
441 	ctx->rq_completed[rq_is_sync(rq)]++;
442 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
443 		atomic_dec(&hctx->nr_active);
444 
445 	wbt_done(q->rq_wb, &rq->issue_stat);
446 
447 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
448 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
449 	if (rq->tag != -1)
450 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
451 	if (sched_tag != -1)
452 		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
453 	blk_mq_sched_restart(hctx);
454 	blk_queue_exit(q);
455 }
456 EXPORT_SYMBOL_GPL(blk_mq_free_request);
457 
458 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
459 {
460 	blk_account_io_done(rq);
461 
462 	if (rq->end_io) {
463 		wbt_done(rq->q->rq_wb, &rq->issue_stat);
464 		rq->end_io(rq, error);
465 	} else {
466 		if (unlikely(blk_bidi_rq(rq)))
467 			blk_mq_free_request(rq->next_rq);
468 		blk_mq_free_request(rq);
469 	}
470 }
471 EXPORT_SYMBOL(__blk_mq_end_request);
472 
473 void blk_mq_end_request(struct request *rq, blk_status_t error)
474 {
475 	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
476 		BUG();
477 	__blk_mq_end_request(rq, error);
478 }
479 EXPORT_SYMBOL(blk_mq_end_request);
480 
481 static void __blk_mq_complete_request_remote(void *data)
482 {
483 	struct request *rq = data;
484 
485 	rq->q->softirq_done_fn(rq);
486 }
487 
488 static void __blk_mq_complete_request(struct request *rq)
489 {
490 	struct blk_mq_ctx *ctx = rq->mq_ctx;
491 	bool shared = false;
492 	int cpu;
493 
494 	if (rq->internal_tag != -1)
495 		blk_mq_sched_completed_request(rq);
496 	if (rq->rq_flags & RQF_STATS) {
497 		blk_mq_poll_stats_start(rq->q);
498 		blk_stat_add(rq);
499 	}
500 
501 	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
502 		rq->q->softirq_done_fn(rq);
503 		return;
504 	}
505 
506 	cpu = get_cpu();
507 	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
508 		shared = cpus_share_cache(cpu, ctx->cpu);
509 
510 	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
511 		rq->csd.func = __blk_mq_complete_request_remote;
512 		rq->csd.info = rq;
513 		rq->csd.flags = 0;
514 		smp_call_function_single_async(ctx->cpu, &rq->csd);
515 	} else {
516 		rq->q->softirq_done_fn(rq);
517 	}
518 	put_cpu();
519 }
520 
521 /**
522  * blk_mq_complete_request - end I/O on a request
523  * @rq:		the request being processed
524  *
525  * Description:
526  *	Ends all I/O on a request. It does not handle partial completions.
527  *	The actual completion happens out-of-order, through a IPI handler.
528  **/
529 void blk_mq_complete_request(struct request *rq)
530 {
531 	struct request_queue *q = rq->q;
532 
533 	if (unlikely(blk_should_fake_timeout(q)))
534 		return;
535 	if (!blk_mark_rq_complete(rq))
536 		__blk_mq_complete_request(rq);
537 }
538 EXPORT_SYMBOL(blk_mq_complete_request);
539 
540 int blk_mq_request_started(struct request *rq)
541 {
542 	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
543 }
544 EXPORT_SYMBOL_GPL(blk_mq_request_started);
545 
546 void blk_mq_start_request(struct request *rq)
547 {
548 	struct request_queue *q = rq->q;
549 
550 	blk_mq_sched_started_request(rq);
551 
552 	trace_block_rq_issue(q, rq);
553 
554 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
555 		blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
556 		rq->rq_flags |= RQF_STATS;
557 		wbt_issue(q->rq_wb, &rq->issue_stat);
558 	}
559 
560 	blk_add_timer(rq);
561 
562 	/*
563 	 * Ensure that ->deadline is visible before set the started
564 	 * flag and clear the completed flag.
565 	 */
566 	smp_mb__before_atomic();
567 
568 	/*
569 	 * Mark us as started and clear complete. Complete might have been
570 	 * set if requeue raced with timeout, which then marked it as
571 	 * complete. So be sure to clear complete again when we start
572 	 * the request, otherwise we'll ignore the completion event.
573 	 */
574 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
575 		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
576 	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
577 		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
578 
579 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
580 		/*
581 		 * Make sure space for the drain appears.  We know we can do
582 		 * this because max_hw_segments has been adjusted to be one
583 		 * fewer than the device can handle.
584 		 */
585 		rq->nr_phys_segments++;
586 	}
587 }
588 EXPORT_SYMBOL(blk_mq_start_request);
589 
590 /*
591  * When we reach here because queue is busy, REQ_ATOM_COMPLETE
592  * flag isn't set yet, so there may be race with timeout handler,
593  * but given rq->deadline is just set in .queue_rq() under
594  * this situation, the race won't be possible in reality because
595  * rq->timeout should be set as big enough to cover the window
596  * between blk_mq_start_request() called from .queue_rq() and
597  * clearing REQ_ATOM_STARTED here.
598  */
599 static void __blk_mq_requeue_request(struct request *rq)
600 {
601 	struct request_queue *q = rq->q;
602 
603 	trace_block_rq_requeue(q, rq);
604 	wbt_requeue(q->rq_wb, &rq->issue_stat);
605 	blk_mq_sched_requeue_request(rq);
606 
607 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
608 		if (q->dma_drain_size && blk_rq_bytes(rq))
609 			rq->nr_phys_segments--;
610 	}
611 }
612 
613 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
614 {
615 	__blk_mq_requeue_request(rq);
616 
617 	BUG_ON(blk_queued_rq(rq));
618 	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
619 }
620 EXPORT_SYMBOL(blk_mq_requeue_request);
621 
622 static void blk_mq_requeue_work(struct work_struct *work)
623 {
624 	struct request_queue *q =
625 		container_of(work, struct request_queue, requeue_work.work);
626 	LIST_HEAD(rq_list);
627 	struct request *rq, *next;
628 	unsigned long flags;
629 
630 	spin_lock_irqsave(&q->requeue_lock, flags);
631 	list_splice_init(&q->requeue_list, &rq_list);
632 	spin_unlock_irqrestore(&q->requeue_lock, flags);
633 
634 	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
635 		if (!(rq->rq_flags & RQF_SOFTBARRIER))
636 			continue;
637 
638 		rq->rq_flags &= ~RQF_SOFTBARRIER;
639 		list_del_init(&rq->queuelist);
640 		blk_mq_sched_insert_request(rq, true, false, false, true);
641 	}
642 
643 	while (!list_empty(&rq_list)) {
644 		rq = list_entry(rq_list.next, struct request, queuelist);
645 		list_del_init(&rq->queuelist);
646 		blk_mq_sched_insert_request(rq, false, false, false, true);
647 	}
648 
649 	blk_mq_run_hw_queues(q, false);
650 }
651 
652 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
653 				bool kick_requeue_list)
654 {
655 	struct request_queue *q = rq->q;
656 	unsigned long flags;
657 
658 	/*
659 	 * We abuse this flag that is otherwise used by the I/O scheduler to
660 	 * request head insertation from the workqueue.
661 	 */
662 	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
663 
664 	spin_lock_irqsave(&q->requeue_lock, flags);
665 	if (at_head) {
666 		rq->rq_flags |= RQF_SOFTBARRIER;
667 		list_add(&rq->queuelist, &q->requeue_list);
668 	} else {
669 		list_add_tail(&rq->queuelist, &q->requeue_list);
670 	}
671 	spin_unlock_irqrestore(&q->requeue_lock, flags);
672 
673 	if (kick_requeue_list)
674 		blk_mq_kick_requeue_list(q);
675 }
676 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
677 
678 void blk_mq_kick_requeue_list(struct request_queue *q)
679 {
680 	kblockd_schedule_delayed_work(&q->requeue_work, 0);
681 }
682 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
683 
684 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
685 				    unsigned long msecs)
686 {
687 	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
688 				    msecs_to_jiffies(msecs));
689 }
690 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
691 
692 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
693 {
694 	if (tag < tags->nr_tags) {
695 		prefetch(tags->rqs[tag]);
696 		return tags->rqs[tag];
697 	}
698 
699 	return NULL;
700 }
701 EXPORT_SYMBOL(blk_mq_tag_to_rq);
702 
/* Accumulator passed to blk_mq_check_expired() during a timeout scan. */
struct blk_mq_timeout_data {
	/* earliest not-yet-expired deadline seen; valid if next_set != 0 */
	unsigned long next;
	/* non-zero once next holds a deadline */
	unsigned int next_set;
};
707 
708 void blk_mq_rq_timed_out(struct request *req, bool reserved)
709 {
710 	const struct blk_mq_ops *ops = req->q->mq_ops;
711 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
712 
713 	/*
714 	 * We know that complete is set at this point. If STARTED isn't set
715 	 * anymore, then the request isn't active and the "timeout" should
716 	 * just be ignored. This can happen due to the bitflag ordering.
717 	 * Timeout first checks if STARTED is set, and if it is, assumes
718 	 * the request is active. But if we race with completion, then
719 	 * both flags will get cleared. So check here again, and ignore
720 	 * a timeout event with a request that isn't active.
721 	 */
722 	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
723 		return;
724 
725 	if (ops->timeout)
726 		ret = ops->timeout(req, reserved);
727 
728 	switch (ret) {
729 	case BLK_EH_HANDLED:
730 		__blk_mq_complete_request(req);
731 		break;
732 	case BLK_EH_RESET_TIMER:
733 		blk_add_timer(req);
734 		blk_clear_rq_complete(req);
735 		break;
736 	case BLK_EH_NOT_HANDLED:
737 		break;
738 	default:
739 		printk(KERN_ERR "block: bad eh return: %d\n", ret);
740 		break;
741 	}
742 }
743 
744 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
745 		struct request *rq, void *priv, bool reserved)
746 {
747 	struct blk_mq_timeout_data *data = priv;
748 
749 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
750 		return;
751 
752 	/*
753 	 * The rq being checked may have been freed and reallocated
754 	 * out already here, we avoid this race by checking rq->deadline
755 	 * and REQ_ATOM_COMPLETE flag together:
756 	 *
757 	 * - if rq->deadline is observed as new value because of
758 	 *   reusing, the rq won't be timed out because of timing.
759 	 * - if rq->deadline is observed as previous value,
760 	 *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
761 	 *   because we put a barrier between setting rq->deadline
762 	 *   and clearing the flag in blk_mq_start_request(), so
763 	 *   this rq won't be timed out too.
764 	 */
765 	if (time_after_eq(jiffies, rq->deadline)) {
766 		if (!blk_mark_rq_complete(rq))
767 			blk_mq_rq_timed_out(rq, reserved);
768 	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
769 		data->next = rq->deadline;
770 		data->next_set = 1;
771 	}
772 }
773 
774 static void blk_mq_timeout_work(struct work_struct *work)
775 {
776 	struct request_queue *q =
777 		container_of(work, struct request_queue, timeout_work);
778 	struct blk_mq_timeout_data data = {
779 		.next		= 0,
780 		.next_set	= 0,
781 	};
782 	int i;
783 
784 	/* A deadlock might occur if a request is stuck requiring a
785 	 * timeout at the same time a queue freeze is waiting
786 	 * completion, since the timeout code would not be able to
787 	 * acquire the queue reference here.
788 	 *
789 	 * That's why we don't use blk_queue_enter here; instead, we use
790 	 * percpu_ref_tryget directly, because we need to be able to
791 	 * obtain a reference even in the short window between the queue
792 	 * starting to freeze, by dropping the first reference in
793 	 * blk_freeze_queue_start, and the moment the last request is
794 	 * consumed, marked by the instant q_usage_counter reaches
795 	 * zero.
796 	 */
797 	if (!percpu_ref_tryget(&q->q_usage_counter))
798 		return;
799 
800 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
801 
802 	if (data.next_set) {
803 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
804 		mod_timer(&q->timeout, data.next);
805 	} else {
806 		struct blk_mq_hw_ctx *hctx;
807 
808 		queue_for_each_hw_ctx(q, hctx, i) {
809 			/* the hctx may be unmapped, so check it here */
810 			if (blk_mq_hw_queue_mapped(hctx))
811 				blk_mq_tag_idle(hctx);
812 		}
813 	}
814 	blk_queue_exit(q);
815 }
816 
/* Argument bundle handed to flush_busy_ctx() by blk_mq_flush_busy_ctxs(). */
struct flush_busy_ctx_data {
	/* hardware queue whose software queues are being flushed */
	struct blk_mq_hw_ctx *hctx;
	/* destination list the pending requests are spliced onto */
	struct list_head *list;
};
821 
822 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
823 {
824 	struct flush_busy_ctx_data *flush_data = data;
825 	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
826 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
827 
828 	sbitmap_clear_bit(sb, bitnr);
829 	spin_lock(&ctx->lock);
830 	list_splice_tail_init(&ctx->rq_list, flush_data->list);
831 	spin_unlock(&ctx->lock);
832 	return true;
833 }
834 
835 /*
836  * Process software queues that have been marked busy, splicing them
837  * to the for-dispatch
838  */
839 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
840 {
841 	struct flush_busy_ctx_data data = {
842 		.hctx = hctx,
843 		.list = list,
844 	};
845 
846 	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
847 }
848 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
849 
850 static inline unsigned int queued_to_index(unsigned int queued)
851 {
852 	if (!queued)
853 		return 0;
854 
855 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
856 }
857 
858 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
859 			   bool wait)
860 {
861 	struct blk_mq_alloc_data data = {
862 		.q = rq->q,
863 		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
864 		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
865 	};
866 
867 	might_sleep_if(wait);
868 
869 	if (rq->tag != -1)
870 		goto done;
871 
872 	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
873 		data.flags |= BLK_MQ_REQ_RESERVED;
874 
875 	rq->tag = blk_mq_get_tag(&data);
876 	if (rq->tag >= 0) {
877 		if (blk_mq_tag_busy(data.hctx)) {
878 			rq->rq_flags |= RQF_MQ_INFLIGHT;
879 			atomic_inc(&data.hctx->nr_active);
880 		}
881 		data.hctx->tags->rqs[rq->tag] = rq;
882 	}
883 
884 done:
885 	if (hctx)
886 		*hctx = data.hctx;
887 	return rq->tag != -1;
888 }
889 
890 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
891 				    struct request *rq)
892 {
893 	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
894 	rq->tag = -1;
895 
896 	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
897 		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
898 		atomic_dec(&hctx->nr_active);
899 	}
900 }
901 
902 static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
903 				       struct request *rq)
904 {
905 	if (rq->tag == -1 || rq->internal_tag == -1)
906 		return;
907 
908 	__blk_mq_put_driver_tag(hctx, rq);
909 }
910 
911 static void blk_mq_put_driver_tag(struct request *rq)
912 {
913 	struct blk_mq_hw_ctx *hctx;
914 
915 	if (rq->tag == -1 || rq->internal_tag == -1)
916 		return;
917 
918 	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
919 	__blk_mq_put_driver_tag(hctx, rq);
920 }
921 
922 /*
923  * If we fail getting a driver tag because all the driver tags are already
924  * assigned and on the dispatch list, BUT the first entry does not have a
925  * tag, then we could deadlock. For that case, move entries with assigned
926  * driver tags to the front, leaving the set of tagged requests in the
927  * same order, and the untagged set in the same order.
928  */
929 static bool reorder_tags_to_front(struct list_head *list)
930 {
931 	struct request *rq, *tmp, *first = NULL;
932 
933 	list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
934 		if (rq == first)
935 			break;
936 		if (rq->tag != -1) {
937 			list_move(&rq->queuelist, list);
938 			if (!first)
939 				first = rq;
940 		}
941 	}
942 
943 	return first != NULL;
944 }
945 
946 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
947 				void *key)
948 {
949 	struct blk_mq_hw_ctx *hctx;
950 
951 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
952 
953 	list_del(&wait->entry);
954 	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
955 	blk_mq_run_hw_queue(hctx, true);
956 	return 1;
957 }
958 
959 static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
960 {
961 	struct sbq_wait_state *ws;
962 
963 	/*
964 	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
965 	 * The thread which wins the race to grab this bit adds the hardware
966 	 * queue to the wait queue.
967 	 */
968 	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
969 	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
970 		return false;
971 
972 	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
973 	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
974 
975 	/*
976 	 * As soon as this returns, it's no longer safe to fiddle with
977 	 * hctx->dispatch_wait, since a completion can wake up the wait queue
978 	 * and unlock the bit.
979 	 */
980 	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
981 	return true;
982 }
983 
984 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
985 {
986 	struct blk_mq_hw_ctx *hctx;
987 	struct request *rq;
988 	int errors, queued;
989 
990 	if (list_empty(list))
991 		return false;
992 
993 	/*
994 	 * Now process all the entries, sending them to the driver.
995 	 */
996 	errors = queued = 0;
997 	do {
998 		struct blk_mq_queue_data bd;
999 		blk_status_t ret;
1000 
1001 		rq = list_first_entry(list, struct request, queuelist);
1002 		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1003 			if (!queued && reorder_tags_to_front(list))
1004 				continue;
1005 
1006 			/*
1007 			 * The initial allocation attempt failed, so we need to
1008 			 * rerun the hardware queue when a tag is freed.
1009 			 */
1010 			if (!blk_mq_dispatch_wait_add(hctx))
1011 				break;
1012 
1013 			/*
1014 			 * It's possible that a tag was freed in the window
1015 			 * between the allocation failure and adding the
1016 			 * hardware queue to the wait queue.
1017 			 */
1018 			if (!blk_mq_get_driver_tag(rq, &hctx, false))
1019 				break;
1020 		}
1021 
1022 		list_del_init(&rq->queuelist);
1023 
1024 		bd.rq = rq;
1025 
1026 		/*
1027 		 * Flag last if we have no more requests, or if we have more
1028 		 * but can't assign a driver tag to it.
1029 		 */
1030 		if (list_empty(list))
1031 			bd.last = true;
1032 		else {
1033 			struct request *nxt;
1034 
1035 			nxt = list_first_entry(list, struct request, queuelist);
1036 			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
1037 		}
1038 
1039 		ret = q->mq_ops->queue_rq(hctx, &bd);
1040 		if (ret == BLK_STS_RESOURCE) {
1041 			blk_mq_put_driver_tag_hctx(hctx, rq);
1042 			list_add(&rq->queuelist, list);
1043 			__blk_mq_requeue_request(rq);
1044 			break;
1045 		}
1046 
1047 		if (unlikely(ret != BLK_STS_OK)) {
1048 			errors++;
1049 			blk_mq_end_request(rq, BLK_STS_IOERR);
1050 			continue;
1051 		}
1052 
1053 		queued++;
1054 	} while (!list_empty(list));
1055 
1056 	hctx->dispatched[queued_to_index(queued)]++;
1057 
1058 	/*
1059 	 * Any items that need requeuing? Stuff them into hctx->dispatch,
1060 	 * that is where we will continue on next queue run.
1061 	 */
1062 	if (!list_empty(list)) {
1063 		/*
1064 		 * If an I/O scheduler has been configured and we got a driver
1065 		 * tag for the next request already, free it again.
1066 		 */
1067 		rq = list_first_entry(list, struct request, queuelist);
1068 		blk_mq_put_driver_tag(rq);
1069 
1070 		spin_lock(&hctx->lock);
1071 		list_splice_init(list, &hctx->dispatch);
1072 		spin_unlock(&hctx->lock);
1073 
1074 		/*
1075 		 * If SCHED_RESTART was set by the caller of this function and
1076 		 * it is no longer set that means that it was cleared by another
1077 		 * thread and hence that a queue rerun is needed.
1078 		 *
1079 		 * If TAG_WAITING is set that means that an I/O scheduler has
1080 		 * been configured and another thread is waiting for a driver
1081 		 * tag. To guarantee fairness, do not rerun this hardware queue
1082 		 * but let the other thread grab the driver tag.
1083 		 *
1084 		 * If no I/O scheduler has been configured it is possible that
1085 		 * the hardware queue got stopped and restarted before requests
1086 		 * were pushed back onto the dispatch list. Rerun the queue to
1087 		 * avoid starvation. Notes:
1088 		 * - blk_mq_run_hw_queue() checks whether or not a queue has
1089 		 *   been stopped before rerunning a queue.
1090 		 * - Some but not all block drivers stop a queue before
1091 		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1092 		 *   and dm-rq.
1093 		 */
1094 		if (!blk_mq_sched_needs_restart(hctx) &&
1095 		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
1096 			blk_mq_run_hw_queue(hctx, true);
1097 	}
1098 
1099 	return (queued + errors) != 0;
1100 }
1101 
1102 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1103 {
1104 	int srcu_idx;
1105 
1106 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1107 		cpu_online(hctx->next_cpu));
1108 
1109 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1110 		rcu_read_lock();
1111 		blk_mq_sched_dispatch_requests(hctx);
1112 		rcu_read_unlock();
1113 	} else {
1114 		might_sleep();
1115 
1116 		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
1117 		blk_mq_sched_dispatch_requests(hctx);
1118 		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
1119 	}
1120 }
1121 
1122 /*
1123  * It'd be great if the workqueue API had a way to pass
1124  * in a mask and had some smarts for more clever placement.
1125  * For now we just round-robin here, switching for every
1126  * BLK_MQ_CPU_WORK_BATCH queued items.
1127  */
1128 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1129 {
1130 	if (hctx->queue->nr_hw_queues == 1)
1131 		return WORK_CPU_UNBOUND;
1132 
1133 	if (--hctx->next_cpu_batch <= 0) {
1134 		int next_cpu;
1135 
1136 		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
1137 		if (next_cpu >= nr_cpu_ids)
1138 			next_cpu = cpumask_first(hctx->cpumask);
1139 
1140 		hctx->next_cpu = next_cpu;
1141 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1142 	}
1143 
1144 	return hctx->next_cpu;
1145 }
1146 
1147 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1148 					unsigned long msecs)
1149 {
1150 	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
1151 		return;
1152 
1153 	if (unlikely(blk_mq_hctx_stopped(hctx)))
1154 		return;
1155 
1156 	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1157 		int cpu = get_cpu();
1158 		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1159 			__blk_mq_run_hw_queue(hctx);
1160 			put_cpu();
1161 			return;
1162 		}
1163 
1164 		put_cpu();
1165 	}
1166 
1167 	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1168 					 &hctx->run_work,
1169 					 msecs_to_jiffies(msecs));
1170 }
1171 
1172 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1173 {
1174 	__blk_mq_delay_run_hw_queue(hctx, true, msecs);
1175 }
1176 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1177 
1178 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1179 {
1180 	__blk_mq_delay_run_hw_queue(hctx, async, 0);
1181 }
1182 EXPORT_SYMBOL(blk_mq_run_hw_queue);
1183 
1184 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1185 {
1186 	struct blk_mq_hw_ctx *hctx;
1187 	int i;
1188 
1189 	queue_for_each_hw_ctx(q, hctx, i) {
1190 		if (!blk_mq_hctx_has_pending(hctx) ||
1191 		    blk_mq_hctx_stopped(hctx))
1192 			continue;
1193 
1194 		blk_mq_run_hw_queue(hctx, async);
1195 	}
1196 }
1197 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1198 
1199 /**
1200  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1201  * @q: request queue.
1202  *
1203  * The caller is responsible for serializing this function against
1204  * blk_mq_{start,stop}_hw_queue().
1205  */
1206 bool blk_mq_queue_stopped(struct request_queue *q)
1207 {
1208 	struct blk_mq_hw_ctx *hctx;
1209 	int i;
1210 
1211 	queue_for_each_hw_ctx(q, hctx, i)
1212 		if (blk_mq_hctx_stopped(hctx))
1213 			return true;
1214 
1215 	return false;
1216 }
1217 EXPORT_SYMBOL(blk_mq_queue_stopped);
1218 
1219 /*
1220  * This function is often used for pausing .queue_rq() by driver when
1221  * there isn't enough resource or some conditions aren't satisfied, and
1222  * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
1223  *
1224  * We do not guarantee that dispatch can be drained or blocked
1225  * after blk_mq_stop_hw_queue() returns. Please use
1226  * blk_mq_quiesce_queue() for that requirement.
1227  */
1228 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1229 {
1230 	cancel_delayed_work(&hctx->run_work);
1231 
1232 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1233 }
1234 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1235 
1236 /*
1237  * This function is often used for pausing .queue_rq() by driver when
1238  * there isn't enough resource or some conditions aren't satisfied, and
1239  * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
1240  *
1241  * We do not guarantee that dispatch can be drained or blocked
1242  * after blk_mq_stop_hw_queues() returns. Please use
1243  * blk_mq_quiesce_queue() for that requirement.
1244  */
1245 void blk_mq_stop_hw_queues(struct request_queue *q)
1246 {
1247 	struct blk_mq_hw_ctx *hctx;
1248 	int i;
1249 
1250 	queue_for_each_hw_ctx(q, hctx, i)
1251 		blk_mq_stop_hw_queue(hctx);
1252 }
1253 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1254 
1255 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1256 {
1257 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1258 
1259 	blk_mq_run_hw_queue(hctx, false);
1260 }
1261 EXPORT_SYMBOL(blk_mq_start_hw_queue);
1262 
1263 void blk_mq_start_hw_queues(struct request_queue *q)
1264 {
1265 	struct blk_mq_hw_ctx *hctx;
1266 	int i;
1267 
1268 	queue_for_each_hw_ctx(q, hctx, i)
1269 		blk_mq_start_hw_queue(hctx);
1270 }
1271 EXPORT_SYMBOL(blk_mq_start_hw_queues);
1272 
1273 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1274 {
1275 	if (!blk_mq_hctx_stopped(hctx))
1276 		return;
1277 
1278 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1279 	blk_mq_run_hw_queue(hctx, async);
1280 }
1281 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1282 
1283 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1284 {
1285 	struct blk_mq_hw_ctx *hctx;
1286 	int i;
1287 
1288 	queue_for_each_hw_ctx(q, hctx, i)
1289 		blk_mq_start_stopped_hw_queue(hctx, async);
1290 }
1291 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1292 
1293 static void blk_mq_run_work_fn(struct work_struct *work)
1294 {
1295 	struct blk_mq_hw_ctx *hctx;
1296 
1297 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1298 
1299 	/*
1300 	 * If we are stopped, don't run the queue. The exception is if
1301 	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
1302 	 * the STOPPED bit and run it.
1303 	 */
1304 	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
1305 		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
1306 			return;
1307 
1308 		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1309 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1310 	}
1311 
1312 	__blk_mq_run_hw_queue(hctx);
1313 }
1314 
1315 
1316 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1317 {
1318 	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
1319 		return;
1320 
1321 	/*
1322 	 * Stop the hw queue, then modify currently delayed work.
1323 	 * This should prevent us from running the queue prematurely.
1324 	 * Mark the queue as auto-clearing STOPPED when it runs.
1325 	 */
1326 	blk_mq_stop_hw_queue(hctx);
1327 	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1328 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1329 					&hctx->run_work,
1330 					msecs_to_jiffies(msecs));
1331 }
1332 EXPORT_SYMBOL(blk_mq_delay_queue);
1333 
1334 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1335 					    struct request *rq,
1336 					    bool at_head)
1337 {
1338 	struct blk_mq_ctx *ctx = rq->mq_ctx;
1339 
1340 	lockdep_assert_held(&ctx->lock);
1341 
1342 	trace_block_rq_insert(hctx->queue, rq);
1343 
1344 	if (at_head)
1345 		list_add(&rq->queuelist, &ctx->rq_list);
1346 	else
1347 		list_add_tail(&rq->queuelist, &ctx->rq_list);
1348 }
1349 
1350 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1351 			     bool at_head)
1352 {
1353 	struct blk_mq_ctx *ctx = rq->mq_ctx;
1354 
1355 	lockdep_assert_held(&ctx->lock);
1356 
1357 	__blk_mq_insert_req_list(hctx, rq, at_head);
1358 	blk_mq_hctx_mark_pending(hctx, ctx);
1359 }
1360 
1361 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1362 			    struct list_head *list)
1363 
1364 {
1365 	/*
1366 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
1367 	 * offline now
1368 	 */
1369 	spin_lock(&ctx->lock);
1370 	while (!list_empty(list)) {
1371 		struct request *rq;
1372 
1373 		rq = list_first_entry(list, struct request, queuelist);
1374 		BUG_ON(rq->mq_ctx != ctx);
1375 		list_del_init(&rq->queuelist);
1376 		__blk_mq_insert_req_list(hctx, rq, false);
1377 	}
1378 	blk_mq_hctx_mark_pending(hctx, ctx);
1379 	spin_unlock(&ctx->lock);
1380 }
1381 
1382 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1383 {
1384 	struct request *rqa = container_of(a, struct request, queuelist);
1385 	struct request *rqb = container_of(b, struct request, queuelist);
1386 
1387 	return !(rqa->mq_ctx < rqb->mq_ctx ||
1388 		 (rqa->mq_ctx == rqb->mq_ctx &&
1389 		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1390 }
1391 
1392 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1393 {
1394 	struct blk_mq_ctx *this_ctx;
1395 	struct request_queue *this_q;
1396 	struct request *rq;
1397 	LIST_HEAD(list);
1398 	LIST_HEAD(ctx_list);
1399 	unsigned int depth;
1400 
1401 	list_splice_init(&plug->mq_list, &list);
1402 
1403 	list_sort(NULL, &list, plug_ctx_cmp);
1404 
1405 	this_q = NULL;
1406 	this_ctx = NULL;
1407 	depth = 0;
1408 
1409 	while (!list_empty(&list)) {
1410 		rq = list_entry_rq(list.next);
1411 		list_del_init(&rq->queuelist);
1412 		BUG_ON(!rq->q);
1413 		if (rq->mq_ctx != this_ctx) {
1414 			if (this_ctx) {
1415 				trace_block_unplug(this_q, depth, from_schedule);
1416 				blk_mq_sched_insert_requests(this_q, this_ctx,
1417 								&ctx_list,
1418 								from_schedule);
1419 			}
1420 
1421 			this_ctx = rq->mq_ctx;
1422 			this_q = rq->q;
1423 			depth = 0;
1424 		}
1425 
1426 		depth++;
1427 		list_add_tail(&rq->queuelist, &ctx_list);
1428 	}
1429 
1430 	/*
1431 	 * If 'this_ctx' is set, we know we have entries to complete
1432 	 * on 'ctx_list'. Do those.
1433 	 */
1434 	if (this_ctx) {
1435 		trace_block_unplug(this_q, depth, from_schedule);
1436 		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1437 						from_schedule);
1438 	}
1439 }
1440 
1441 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1442 {
1443 	blk_init_request_from_bio(rq, bio);
1444 
1445 	blk_account_io_start(rq, true);
1446 }
1447 
1448 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1449 {
1450 	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1451 		!blk_queue_nomerges(hctx->queue);
1452 }
1453 
1454 static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
1455 				   struct blk_mq_ctx *ctx,
1456 				   struct request *rq)
1457 {
1458 	spin_lock(&ctx->lock);
1459 	__blk_mq_insert_request(hctx, rq, false);
1460 	spin_unlock(&ctx->lock);
1461 }
1462 
1463 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1464 {
1465 	if (rq->tag != -1)
1466 		return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1467 
1468 	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1469 }
1470 
1471 static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1472 					struct request *rq,
1473 					blk_qc_t *cookie, bool may_sleep)
1474 {
1475 	struct request_queue *q = rq->q;
1476 	struct blk_mq_queue_data bd = {
1477 		.rq = rq,
1478 		.last = true,
1479 	};
1480 	blk_qc_t new_cookie;
1481 	blk_status_t ret;
1482 	bool run_queue = true;
1483 
1484 	/* RCU or SRCU read lock is needed before checking quiesced flag */
1485 	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
1486 		run_queue = false;
1487 		goto insert;
1488 	}
1489 
1490 	if (q->elevator)
1491 		goto insert;
1492 
1493 	if (!blk_mq_get_driver_tag(rq, NULL, false))
1494 		goto insert;
1495 
1496 	new_cookie = request_to_qc_t(hctx, rq);
1497 
1498 	/*
1499 	 * For OK queue, we are done. For error, kill it. Any other
1500 	 * error (busy), just add it to our list as we previously
1501 	 * would have done
1502 	 */
1503 	ret = q->mq_ops->queue_rq(hctx, &bd);
1504 	switch (ret) {
1505 	case BLK_STS_OK:
1506 		*cookie = new_cookie;
1507 		return;
1508 	case BLK_STS_RESOURCE:
1509 		__blk_mq_requeue_request(rq);
1510 		goto insert;
1511 	default:
1512 		*cookie = BLK_QC_T_NONE;
1513 		blk_mq_end_request(rq, ret);
1514 		return;
1515 	}
1516 
1517 insert:
1518 	blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
1519 }
1520 
1521 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1522 		struct request *rq, blk_qc_t *cookie)
1523 {
1524 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1525 		rcu_read_lock();
1526 		__blk_mq_try_issue_directly(hctx, rq, cookie, false);
1527 		rcu_read_unlock();
1528 	} else {
1529 		unsigned int srcu_idx;
1530 
1531 		might_sleep();
1532 
1533 		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
1534 		__blk_mq_try_issue_directly(hctx, rq, cookie, true);
1535 		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
1536 	}
1537 }
1538 
1539 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1540 {
1541 	const int is_sync = op_is_sync(bio->bi_opf);
1542 	const int is_flush_fua = op_is_flush(bio->bi_opf);
1543 	struct blk_mq_alloc_data data = { .flags = 0 };
1544 	struct request *rq;
1545 	unsigned int request_count = 0;
1546 	struct blk_plug *plug;
1547 	struct request *same_queue_rq = NULL;
1548 	blk_qc_t cookie;
1549 	unsigned int wb_acct;
1550 
1551 	blk_queue_bounce(q, &bio);
1552 
1553 	blk_queue_split(q, &bio);
1554 
1555 	if (!bio_integrity_prep(bio))
1556 		return BLK_QC_T_NONE;
1557 
1558 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
1559 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1560 		return BLK_QC_T_NONE;
1561 
1562 	if (blk_mq_sched_bio_merge(q, bio))
1563 		return BLK_QC_T_NONE;
1564 
1565 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
1566 
1567 	trace_block_getrq(q, bio, bio->bi_opf);
1568 
1569 	rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
1570 	if (unlikely(!rq)) {
1571 		__wbt_done(q->rq_wb, wb_acct);
1572 		if (bio->bi_opf & REQ_NOWAIT)
1573 			bio_wouldblock_error(bio);
1574 		return BLK_QC_T_NONE;
1575 	}
1576 
1577 	wbt_track(&rq->issue_stat, wb_acct);
1578 
1579 	cookie = request_to_qc_t(data.hctx, rq);
1580 
1581 	plug = current->plug;
1582 	if (unlikely(is_flush_fua)) {
1583 		blk_mq_put_ctx(data.ctx);
1584 		blk_mq_bio_to_request(rq, bio);
1585 		if (q->elevator) {
1586 			blk_mq_sched_insert_request(rq, false, true, true,
1587 					true);
1588 		} else {
1589 			blk_insert_flush(rq);
1590 			blk_mq_run_hw_queue(data.hctx, true);
1591 		}
1592 	} else if (plug && q->nr_hw_queues == 1) {
1593 		struct request *last = NULL;
1594 
1595 		blk_mq_put_ctx(data.ctx);
1596 		blk_mq_bio_to_request(rq, bio);
1597 
1598 		/*
1599 		 * @request_count may become stale because of schedule
1600 		 * out, so check the list again.
1601 		 */
1602 		if (list_empty(&plug->mq_list))
1603 			request_count = 0;
1604 		else if (blk_queue_nomerges(q))
1605 			request_count = blk_plug_queued_count(q);
1606 
1607 		if (!request_count)
1608 			trace_block_plug(q);
1609 		else
1610 			last = list_entry_rq(plug->mq_list.prev);
1611 
1612 		if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1613 		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1614 			blk_flush_plug_list(plug, false);
1615 			trace_block_plug(q);
1616 		}
1617 
1618 		list_add_tail(&rq->queuelist, &plug->mq_list);
1619 	} else if (plug && !blk_queue_nomerges(q)) {
1620 		blk_mq_bio_to_request(rq, bio);
1621 
1622 		/*
1623 		 * We do limited plugging. If the bio can be merged, do that.
1624 		 * Otherwise the existing request in the plug list will be
1625 		 * issued. So the plug list will have one request at most
1626 		 * The plug list might get flushed before this. If that happens,
1627 		 * the plug list is empty, and same_queue_rq is invalid.
1628 		 */
1629 		if (list_empty(&plug->mq_list))
1630 			same_queue_rq = NULL;
1631 		if (same_queue_rq)
1632 			list_del_init(&same_queue_rq->queuelist);
1633 		list_add_tail(&rq->queuelist, &plug->mq_list);
1634 
1635 		blk_mq_put_ctx(data.ctx);
1636 
1637 		if (same_queue_rq) {
1638 			data.hctx = blk_mq_map_queue(q,
1639 					same_queue_rq->mq_ctx->cpu);
1640 			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1641 					&cookie);
1642 		}
1643 	} else if (q->nr_hw_queues > 1 && is_sync) {
1644 		blk_mq_put_ctx(data.ctx);
1645 		blk_mq_bio_to_request(rq, bio);
1646 		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
1647 	} else if (q->elevator) {
1648 		blk_mq_put_ctx(data.ctx);
1649 		blk_mq_bio_to_request(rq, bio);
1650 		blk_mq_sched_insert_request(rq, false, true, true, true);
1651 	} else {
1652 		blk_mq_put_ctx(data.ctx);
1653 		blk_mq_bio_to_request(rq, bio);
1654 		blk_mq_queue_io(data.hctx, data.ctx, rq);
1655 		blk_mq_run_hw_queue(data.hctx, true);
1656 	}
1657 
1658 	return cookie;
1659 }
1660 
1661 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1662 		     unsigned int hctx_idx)
1663 {
1664 	struct page *page;
1665 
1666 	if (tags->rqs && set->ops->exit_request) {
1667 		int i;
1668 
1669 		for (i = 0; i < tags->nr_tags; i++) {
1670 			struct request *rq = tags->static_rqs[i];
1671 
1672 			if (!rq)
1673 				continue;
1674 			set->ops->exit_request(set, rq, hctx_idx);
1675 			tags->static_rqs[i] = NULL;
1676 		}
1677 	}
1678 
1679 	while (!list_empty(&tags->page_list)) {
1680 		page = list_first_entry(&tags->page_list, struct page, lru);
1681 		list_del_init(&page->lru);
1682 		/*
1683 		 * Remove kmemleak object previously allocated in
1684 		 * blk_mq_init_rq_map().
1685 		 */
1686 		kmemleak_free(page_address(page));
1687 		__free_pages(page, page->private);
1688 	}
1689 }
1690 
1691 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
1692 {
1693 	kfree(tags->rqs);
1694 	tags->rqs = NULL;
1695 	kfree(tags->static_rqs);
1696 	tags->static_rqs = NULL;
1697 
1698 	blk_mq_free_tags(tags);
1699 }
1700 
1701 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1702 					unsigned int hctx_idx,
1703 					unsigned int nr_tags,
1704 					unsigned int reserved_tags)
1705 {
1706 	struct blk_mq_tags *tags;
1707 	int node;
1708 
1709 	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
1710 	if (node == NUMA_NO_NODE)
1711 		node = set->numa_node;
1712 
1713 	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
1714 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1715 	if (!tags)
1716 		return NULL;
1717 
1718 	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1719 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1720 				 node);
1721 	if (!tags->rqs) {
1722 		blk_mq_free_tags(tags);
1723 		return NULL;
1724 	}
1725 
1726 	tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1727 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1728 				 node);
1729 	if (!tags->static_rqs) {
1730 		kfree(tags->rqs);
1731 		blk_mq_free_tags(tags);
1732 		return NULL;
1733 	}
1734 
1735 	return tags;
1736 }
1737 
1738 static size_t order_to_size(unsigned int order)
1739 {
1740 	return (size_t)PAGE_SIZE << order;
1741 }
1742 
1743 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1744 		     unsigned int hctx_idx, unsigned int depth)
1745 {
1746 	unsigned int i, j, entries_per_page, max_order = 4;
1747 	size_t rq_size, left;
1748 	int node;
1749 
1750 	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
1751 	if (node == NUMA_NO_NODE)
1752 		node = set->numa_node;
1753 
1754 	INIT_LIST_HEAD(&tags->page_list);
1755 
1756 	/*
1757 	 * rq_size is the size of the request plus driver payload, rounded
1758 	 * to the cacheline size
1759 	 */
1760 	rq_size = round_up(sizeof(struct request) + set->cmd_size,
1761 				cache_line_size());
1762 	left = rq_size * depth;
1763 
1764 	for (i = 0; i < depth; ) {
1765 		int this_order = max_order;
1766 		struct page *page;
1767 		int to_do;
1768 		void *p;
1769 
1770 		while (this_order && left < order_to_size(this_order - 1))
1771 			this_order--;
1772 
1773 		do {
1774 			page = alloc_pages_node(node,
1775 				GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1776 				this_order);
1777 			if (page)
1778 				break;
1779 			if (!this_order--)
1780 				break;
1781 			if (order_to_size(this_order) < rq_size)
1782 				break;
1783 		} while (1);
1784 
1785 		if (!page)
1786 			goto fail;
1787 
1788 		page->private = this_order;
1789 		list_add_tail(&page->lru, &tags->page_list);
1790 
1791 		p = page_address(page);
1792 		/*
1793 		 * Allow kmemleak to scan these pages as they contain pointers
1794 		 * to additional allocations like via ops->init_request().
1795 		 */
1796 		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
1797 		entries_per_page = order_to_size(this_order) / rq_size;
1798 		to_do = min(entries_per_page, depth - i);
1799 		left -= to_do * rq_size;
1800 		for (j = 0; j < to_do; j++) {
1801 			struct request *rq = p;
1802 
1803 			tags->static_rqs[i] = rq;
1804 			if (set->ops->init_request) {
1805 				if (set->ops->init_request(set, rq, hctx_idx,
1806 						node)) {
1807 					tags->static_rqs[i] = NULL;
1808 					goto fail;
1809 				}
1810 			}
1811 
1812 			p += rq_size;
1813 			i++;
1814 		}
1815 	}
1816 	return 0;
1817 
1818 fail:
1819 	blk_mq_free_rqs(set, tags, hctx_idx);
1820 	return -ENOMEM;
1821 }
1822 
1823 /*
1824  * 'cpu' is going away. splice any existing rq_list entries from this
1825  * software queue to the hw queue dispatch list, and ensure that it
1826  * gets run.
1827  */
1828 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
1829 {
1830 	struct blk_mq_hw_ctx *hctx;
1831 	struct blk_mq_ctx *ctx;
1832 	LIST_HEAD(tmp);
1833 
1834 	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
1835 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1836 
1837 	spin_lock(&ctx->lock);
1838 	if (!list_empty(&ctx->rq_list)) {
1839 		list_splice_init(&ctx->rq_list, &tmp);
1840 		blk_mq_hctx_clear_pending(hctx, ctx);
1841 	}
1842 	spin_unlock(&ctx->lock);
1843 
1844 	if (list_empty(&tmp))
1845 		return 0;
1846 
1847 	spin_lock(&hctx->lock);
1848 	list_splice_tail_init(&tmp, &hctx->dispatch);
1849 	spin_unlock(&hctx->lock);
1850 
1851 	blk_mq_run_hw_queue(hctx, true);
1852 	return 0;
1853 }
1854 
1855 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
1856 {
1857 	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
1858 					    &hctx->cpuhp_dead);
1859 }
1860 
1861 /* hctx->ctxs will be freed in queue's release handler */
1862 static void blk_mq_exit_hctx(struct request_queue *q,
1863 		struct blk_mq_tag_set *set,
1864 		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1865 {
1866 	blk_mq_debugfs_unregister_hctx(hctx);
1867 
1868 	blk_mq_tag_idle(hctx);
1869 
1870 	if (set->ops->exit_request)
1871 		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
1872 
1873 	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1874 
1875 	if (set->ops->exit_hctx)
1876 		set->ops->exit_hctx(hctx, hctx_idx);
1877 
1878 	if (hctx->flags & BLK_MQ_F_BLOCKING)
1879 		cleanup_srcu_struct(hctx->queue_rq_srcu);
1880 
1881 	blk_mq_remove_cpuhp(hctx);
1882 	blk_free_flush_queue(hctx->fq);
1883 	sbitmap_free(&hctx->ctx_map);
1884 }
1885 
1886 static void blk_mq_exit_hw_queues(struct request_queue *q,
1887 		struct blk_mq_tag_set *set, int nr_queue)
1888 {
1889 	struct blk_mq_hw_ctx *hctx;
1890 	unsigned int i;
1891 
1892 	queue_for_each_hw_ctx(q, hctx, i) {
1893 		if (i == nr_queue)
1894 			break;
1895 		blk_mq_exit_hctx(q, set, hctx, i);
1896 	}
1897 }
1898 
1899 static int blk_mq_init_hctx(struct request_queue *q,
1900 		struct blk_mq_tag_set *set,
1901 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1902 {
1903 	int node;
1904 
1905 	node = hctx->numa_node;
1906 	if (node == NUMA_NO_NODE)
1907 		node = hctx->numa_node = set->numa_node;
1908 
1909 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1910 	spin_lock_init(&hctx->lock);
1911 	INIT_LIST_HEAD(&hctx->dispatch);
1912 	hctx->queue = q;
1913 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
1914 
1915 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
1916 
1917 	hctx->tags = set->tags[hctx_idx];
1918 
1919 	/*
1920 	 * Allocate space for all possible cpus to avoid allocation at
1921 	 * runtime
1922 	 */
1923 	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1924 					GFP_KERNEL, node);
1925 	if (!hctx->ctxs)
1926 		goto unregister_cpu_notifier;
1927 
1928 	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1929 			      node))
1930 		goto free_ctxs;
1931 
1932 	hctx->nr_ctx = 0;
1933 
1934 	if (set->ops->init_hctx &&
1935 	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1936 		goto free_bitmap;
1937 
1938 	if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
1939 		goto exit_hctx;
1940 
1941 	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1942 	if (!hctx->fq)
1943 		goto sched_exit_hctx;
1944 
1945 	if (set->ops->init_request &&
1946 	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
1947 				   node))
1948 		goto free_fq;
1949 
1950 	if (hctx->flags & BLK_MQ_F_BLOCKING)
1951 		init_srcu_struct(hctx->queue_rq_srcu);
1952 
1953 	blk_mq_debugfs_register_hctx(q, hctx);
1954 
1955 	return 0;
1956 
1957  free_fq:
1958 	kfree(hctx->fq);
1959  sched_exit_hctx:
1960 	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1961  exit_hctx:
1962 	if (set->ops->exit_hctx)
1963 		set->ops->exit_hctx(hctx, hctx_idx);
1964  free_bitmap:
1965 	sbitmap_free(&hctx->ctx_map);
1966  free_ctxs:
1967 	kfree(hctx->ctxs);
1968  unregister_cpu_notifier:
1969 	blk_mq_remove_cpuhp(hctx);
1970 	return -1;
1971 }
1972 
1973 static void blk_mq_init_cpu_queues(struct request_queue *q,
1974 				   unsigned int nr_hw_queues)
1975 {
1976 	unsigned int i;
1977 
1978 	for_each_possible_cpu(i) {
1979 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1980 		struct blk_mq_hw_ctx *hctx;
1981 
1982 		__ctx->cpu = i;
1983 		spin_lock_init(&__ctx->lock);
1984 		INIT_LIST_HEAD(&__ctx->rq_list);
1985 		__ctx->queue = q;
1986 
1987 		/* If the cpu isn't present, the cpu is mapped to first hctx */
1988 		if (!cpu_present(i))
1989 			continue;
1990 
1991 		hctx = blk_mq_map_queue(q, i);
1992 
1993 		/*
1994 		 * Set local node, IFF we have more than one hw queue. If
1995 		 * not, we remain on the home node of the device
1996 		 */
1997 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1998 			hctx->numa_node = local_memory_node(cpu_to_node(i));
1999 	}
2000 }
2001 
2002 static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2003 {
2004 	int ret = 0;
2005 
2006 	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2007 					set->queue_depth, set->reserved_tags);
2008 	if (!set->tags[hctx_idx])
2009 		return false;
2010 
2011 	ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2012 				set->queue_depth);
2013 	if (!ret)
2014 		return true;
2015 
2016 	blk_mq_free_rq_map(set->tags[hctx_idx]);
2017 	set->tags[hctx_idx] = NULL;
2018 	return false;
2019 }
2020 
2021 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2022 					 unsigned int hctx_idx)
2023 {
2024 	if (set->tags[hctx_idx]) {
2025 		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2026 		blk_mq_free_rq_map(set->tags[hctx_idx]);
2027 		set->tags[hctx_idx] = NULL;
2028 	}
2029 }
2030 
2031 static void blk_mq_map_swqueue(struct request_queue *q)
2032 {
2033 	unsigned int i, hctx_idx;
2034 	struct blk_mq_hw_ctx *hctx;
2035 	struct blk_mq_ctx *ctx;
2036 	struct blk_mq_tag_set *set = q->tag_set;
2037 
2038 	/*
2039 	 * Avoid others reading imcomplete hctx->cpumask through sysfs
2040 	 */
2041 	mutex_lock(&q->sysfs_lock);
2042 
2043 	queue_for_each_hw_ctx(q, hctx, i) {
2044 		cpumask_clear(hctx->cpumask);
2045 		hctx->nr_ctx = 0;
2046 	}
2047 
2048 	/*
2049 	 * Map software to hardware queues.
2050 	 *
2051 	 * If the cpu isn't present, the cpu is mapped to first hctx.
2052 	 */
2053 	for_each_present_cpu(i) {
2054 		hctx_idx = q->mq_map[i];
2055 		/* unmapped hw queue can be remapped after CPU topo changed */
2056 		if (!set->tags[hctx_idx] &&
2057 		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2058 			/*
2059 			 * If tags initialization fail for some hctx,
2060 			 * that hctx won't be brought online.  In this
2061 			 * case, remap the current ctx to hctx[0] which
2062 			 * is guaranteed to always have tags allocated
2063 			 */
2064 			q->mq_map[i] = 0;
2065 		}
2066 
2067 		ctx = per_cpu_ptr(q->queue_ctx, i);
2068 		hctx = blk_mq_map_queue(q, i);
2069 
2070 		cpumask_set_cpu(i, hctx->cpumask);
2071 		ctx->index_hw = hctx->nr_ctx;
2072 		hctx->ctxs[hctx->nr_ctx++] = ctx;
2073 	}
2074 
2075 	mutex_unlock(&q->sysfs_lock);
2076 
2077 	queue_for_each_hw_ctx(q, hctx, i) {
2078 		/*
2079 		 * If no software queues are mapped to this hardware queue,
2080 		 * disable it and free the request entries.
2081 		 */
2082 		if (!hctx->nr_ctx) {
2083 			/* Never unmap queue 0.  We need it as a
2084 			 * fallback in case of a new remap fails
2085 			 * allocation
2086 			 */
2087 			if (i && set->tags[i])
2088 				blk_mq_free_map_and_requests(set, i);
2089 
2090 			hctx->tags = NULL;
2091 			continue;
2092 		}
2093 
2094 		hctx->tags = set->tags[i];
2095 		WARN_ON(!hctx->tags);
2096 
2097 		/*
2098 		 * Set the map size to the number of mapped software queues.
2099 		 * This is more accurate and more efficient than looping
2100 		 * over all possibly mapped software queues.
2101 		 */
2102 		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2103 
2104 		/*
2105 		 * Initialize batch roundrobin counts
2106 		 */
2107 		hctx->next_cpu = cpumask_first(hctx->cpumask);
2108 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2109 	}
2110 }
2111 
2112 /*
2113  * Caller needs to ensure that we're either frozen/quiesced, or that
2114  * the queue isn't live yet.
2115  */
2116 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2117 {
2118 	struct blk_mq_hw_ctx *hctx;
2119 	int i;
2120 
2121 	queue_for_each_hw_ctx(q, hctx, i) {
2122 		if (shared) {
2123 			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2124 				atomic_inc(&q->shared_hctx_restart);
2125 			hctx->flags |= BLK_MQ_F_TAG_SHARED;
2126 		} else {
2127 			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2128 				atomic_dec(&q->shared_hctx_restart);
2129 			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2130 		}
2131 	}
2132 }
2133 
2134 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2135 					bool shared)
2136 {
2137 	struct request_queue *q;
2138 
2139 	lockdep_assert_held(&set->tag_list_lock);
2140 
2141 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
2142 		blk_mq_freeze_queue(q);
2143 		queue_set_hctx_shared(q, shared);
2144 		blk_mq_unfreeze_queue(q);
2145 	}
2146 }
2147 
2148 static void blk_mq_del_queue_tag_set(struct request_queue *q)
2149 {
2150 	struct blk_mq_tag_set *set = q->tag_set;
2151 
2152 	mutex_lock(&set->tag_list_lock);
2153 	list_del_rcu(&q->tag_set_list);
2154 	INIT_LIST_HEAD(&q->tag_set_list);
2155 	if (list_is_singular(&set->tag_list)) {
2156 		/* just transitioned to unshared */
2157 		set->flags &= ~BLK_MQ_F_TAG_SHARED;
2158 		/* update existing queue */
2159 		blk_mq_update_tag_set_depth(set, false);
2160 	}
2161 	mutex_unlock(&set->tag_list_lock);
2162 
2163 	synchronize_rcu();
2164 }
2165 
2166 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2167 				     struct request_queue *q)
2168 {
2169 	q->tag_set = set;
2170 
2171 	mutex_lock(&set->tag_list_lock);
2172 
2173 	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
2174 	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2175 		set->flags |= BLK_MQ_F_TAG_SHARED;
2176 		/* update existing queue */
2177 		blk_mq_update_tag_set_depth(set, true);
2178 	}
2179 	if (set->flags & BLK_MQ_F_TAG_SHARED)
2180 		queue_set_hctx_shared(q, true);
2181 	list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2182 
2183 	mutex_unlock(&set->tag_list_lock);
2184 }
2185 
2186 /*
2187  * It is the actual release handler for mq, but we do it from
2188  * request queue's release handler for avoiding use-after-free
2189  * and headache because q->mq_kobj shouldn't have been introduced,
2190  * but we can't group ctx/kctx kobj without it.
2191  */
2192 void blk_mq_release(struct request_queue *q)
2193 {
2194 	struct blk_mq_hw_ctx *hctx;
2195 	unsigned int i;
2196 
2197 	/* hctx kobj stays in hctx */
2198 	queue_for_each_hw_ctx(q, hctx, i) {
2199 		if (!hctx)
2200 			continue;
2201 		kobject_put(&hctx->kobj);
2202 	}
2203 
2204 	q->mq_map = NULL;
2205 
2206 	kfree(q->queue_hw_ctx);
2207 
2208 	/*
2209 	 * release .mq_kobj and sw queue's kobject now because
2210 	 * both share lifetime with request queue.
2211 	 */
2212 	blk_mq_sysfs_deinit(q);
2213 
2214 	free_percpu(q->queue_ctx);
2215 }
2216 
2217 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2218 {
2219 	struct request_queue *uninit_q, *q;
2220 
2221 	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2222 	if (!uninit_q)
2223 		return ERR_PTR(-ENOMEM);
2224 
2225 	q = blk_mq_init_allocated_queue(set, uninit_q);
2226 	if (IS_ERR(q))
2227 		blk_cleanup_queue(uninit_q);
2228 
2229 	return q;
2230 }
2231 EXPORT_SYMBOL(blk_mq_init_queue);
2232 
2233 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2234 {
2235 	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2236 
2237 	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
2238 			   __alignof__(struct blk_mq_hw_ctx)) !=
2239 		     sizeof(struct blk_mq_hw_ctx));
2240 
2241 	if (tag_set->flags & BLK_MQ_F_BLOCKING)
2242 		hw_ctx_size += sizeof(struct srcu_struct);
2243 
2244 	return hw_ctx_size;
2245 }
2246 
2247 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2248 						struct request_queue *q)
2249 {
2250 	int i, j;
2251 	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2252 
2253 	blk_mq_sysfs_unregister(q);
2254 	for (i = 0; i < set->nr_hw_queues; i++) {
2255 		int node;
2256 
2257 		if (hctxs[i])
2258 			continue;
2259 
2260 		node = blk_mq_hw_queue_to_node(q->mq_map, i);
2261 		hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2262 					GFP_KERNEL, node);
2263 		if (!hctxs[i])
2264 			break;
2265 
2266 		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
2267 						node)) {
2268 			kfree(hctxs[i]);
2269 			hctxs[i] = NULL;
2270 			break;
2271 		}
2272 
2273 		atomic_set(&hctxs[i]->nr_active, 0);
2274 		hctxs[i]->numa_node = node;
2275 		hctxs[i]->queue_num = i;
2276 
2277 		if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2278 			free_cpumask_var(hctxs[i]->cpumask);
2279 			kfree(hctxs[i]);
2280 			hctxs[i] = NULL;
2281 			break;
2282 		}
2283 		blk_mq_hctx_kobj_init(hctxs[i]);
2284 	}
2285 	for (j = i; j < q->nr_hw_queues; j++) {
2286 		struct blk_mq_hw_ctx *hctx = hctxs[j];
2287 
2288 		if (hctx) {
2289 			if (hctx->tags)
2290 				blk_mq_free_map_and_requests(set, j);
2291 			blk_mq_exit_hctx(q, set, hctx, j);
2292 			kobject_put(&hctx->kobj);
2293 			hctxs[j] = NULL;
2294 
2295 		}
2296 	}
2297 	q->nr_hw_queues = i;
2298 	blk_mq_sysfs_register(q);
2299 }
2300 
2301 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2302 						  struct request_queue *q)
2303 {
2304 	/* mark the queue as mq asap */
2305 	q->mq_ops = set->ops;
2306 
2307 	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2308 					     blk_mq_poll_stats_bkt,
2309 					     BLK_MQ_POLL_STATS_BKTS, q);
2310 	if (!q->poll_cb)
2311 		goto err_exit;
2312 
2313 	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2314 	if (!q->queue_ctx)
2315 		goto err_exit;
2316 
2317 	/* init q->mq_kobj and sw queues' kobjects */
2318 	blk_mq_sysfs_init(q);
2319 
2320 	q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2321 						GFP_KERNEL, set->numa_node);
2322 	if (!q->queue_hw_ctx)
2323 		goto err_percpu;
2324 
2325 	q->mq_map = set->mq_map;
2326 
2327 	blk_mq_realloc_hw_ctxs(set, q);
2328 	if (!q->nr_hw_queues)
2329 		goto err_hctxs;
2330 
2331 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2332 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2333 
2334 	q->nr_queues = nr_cpu_ids;
2335 
2336 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2337 
2338 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
2339 		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2340 
2341 	q->sg_reserved_size = INT_MAX;
2342 
2343 	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2344 	INIT_LIST_HEAD(&q->requeue_list);
2345 	spin_lock_init(&q->requeue_lock);
2346 
2347 	blk_queue_make_request(q, blk_mq_make_request);
2348 
2349 	/*
2350 	 * Do this after blk_queue_make_request() overrides it...
2351 	 */
2352 	q->nr_requests = set->queue_depth;
2353 
2354 	/*
2355 	 * Default to classic polling
2356 	 */
2357 	q->poll_nsec = -1;
2358 
2359 	if (set->ops->complete)
2360 		blk_queue_softirq_done(q, set->ops->complete);
2361 
2362 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2363 	blk_mq_add_queue_tag_set(set, q);
2364 	blk_mq_map_swqueue(q);
2365 
2366 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2367 		int ret;
2368 
2369 		ret = blk_mq_sched_init(q);
2370 		if (ret)
2371 			return ERR_PTR(ret);
2372 	}
2373 
2374 	return q;
2375 
2376 err_hctxs:
2377 	kfree(q->queue_hw_ctx);
2378 err_percpu:
2379 	free_percpu(q->queue_ctx);
2380 err_exit:
2381 	q->mq_ops = NULL;
2382 	return ERR_PTR(-ENOMEM);
2383 }
2384 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2385 
2386 void blk_mq_free_queue(struct request_queue *q)
2387 {
2388 	struct blk_mq_tag_set	*set = q->tag_set;
2389 
2390 	blk_mq_del_queue_tag_set(q);
2391 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2392 }
2393 
2394 /* Basically redo blk_mq_init_queue with queue frozen */
2395 static void blk_mq_queue_reinit(struct request_queue *q)
2396 {
2397 	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2398 
2399 	blk_mq_debugfs_unregister_hctxs(q);
2400 	blk_mq_sysfs_unregister(q);
2401 
2402 	/*
2403 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2404 	 * we should change hctx numa_node according to new topology (this
2405 	 * involves free and re-allocate memory, worthy doing?)
2406 	 */
2407 
2408 	blk_mq_map_swqueue(q);
2409 
2410 	blk_mq_sysfs_register(q);
2411 	blk_mq_debugfs_register_hctxs(q);
2412 }
2413 
2414 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2415 {
2416 	int i;
2417 
2418 	for (i = 0; i < set->nr_hw_queues; i++)
2419 		if (!__blk_mq_alloc_rq_map(set, i))
2420 			goto out_unwind;
2421 
2422 	return 0;
2423 
2424 out_unwind:
2425 	while (--i >= 0)
2426 		blk_mq_free_rq_map(set->tags[i]);
2427 
2428 	return -ENOMEM;
2429 }
2430 
2431 /*
2432  * Allocate the request maps associated with this tag_set. Note that this
2433  * may reduce the depth asked for, if memory is tight. set->queue_depth
2434  * will be updated to reflect the allocated depth.
2435  */
2436 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2437 {
2438 	unsigned int depth;
2439 	int err;
2440 
2441 	depth = set->queue_depth;
2442 	do {
2443 		err = __blk_mq_alloc_rq_maps(set);
2444 		if (!err)
2445 			break;
2446 
2447 		set->queue_depth >>= 1;
2448 		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2449 			err = -ENOMEM;
2450 			break;
2451 		}
2452 	} while (set->queue_depth);
2453 
2454 	if (!set->queue_depth || err) {
2455 		pr_err("blk-mq: failed to allocate request map\n");
2456 		return -ENOMEM;
2457 	}
2458 
2459 	if (depth != set->queue_depth)
2460 		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2461 						depth, set->queue_depth);
2462 
2463 	return 0;
2464 }
2465 
2466 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2467 {
2468 	if (set->ops->map_queues)
2469 		return set->ops->map_queues(set);
2470 	else
2471 		return blk_mq_map_queues(set);
2472 }
2473 
2474 /*
2475  * Alloc a tag set to be associated with one or more request queues.
2476  * May fail with EINVAL for various error conditions. May adjust the
2477  * requested depth down, if if it too large. In that case, the set
2478  * value will be stored in set->queue_depth.
2479  */
2480 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2481 {
2482 	int ret;
2483 
2484 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2485 
2486 	if (!set->nr_hw_queues)
2487 		return -EINVAL;
2488 	if (!set->queue_depth)
2489 		return -EINVAL;
2490 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2491 		return -EINVAL;
2492 
2493 	if (!set->ops->queue_rq)
2494 		return -EINVAL;
2495 
2496 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2497 		pr_info("blk-mq: reduced tag depth to %u\n",
2498 			BLK_MQ_MAX_DEPTH);
2499 		set->queue_depth = BLK_MQ_MAX_DEPTH;
2500 	}
2501 
2502 	/*
2503 	 * If a crashdump is active, then we are potentially in a very
2504 	 * memory constrained environment. Limit us to 1 queue and
2505 	 * 64 tags to prevent using too much memory.
2506 	 */
2507 	if (is_kdump_kernel()) {
2508 		set->nr_hw_queues = 1;
2509 		set->queue_depth = min(64U, set->queue_depth);
2510 	}
2511 	/*
2512 	 * There is no use for more h/w queues than cpus.
2513 	 */
2514 	if (set->nr_hw_queues > nr_cpu_ids)
2515 		set->nr_hw_queues = nr_cpu_ids;
2516 
2517 	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
2518 				 GFP_KERNEL, set->numa_node);
2519 	if (!set->tags)
2520 		return -ENOMEM;
2521 
2522 	ret = -ENOMEM;
2523 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2524 			GFP_KERNEL, set->numa_node);
2525 	if (!set->mq_map)
2526 		goto out_free_tags;
2527 
2528 	ret = blk_mq_update_queue_map(set);
2529 	if (ret)
2530 		goto out_free_mq_map;
2531 
2532 	ret = blk_mq_alloc_rq_maps(set);
2533 	if (ret)
2534 		goto out_free_mq_map;
2535 
2536 	mutex_init(&set->tag_list_lock);
2537 	INIT_LIST_HEAD(&set->tag_list);
2538 
2539 	return 0;
2540 
2541 out_free_mq_map:
2542 	kfree(set->mq_map);
2543 	set->mq_map = NULL;
2544 out_free_tags:
2545 	kfree(set->tags);
2546 	set->tags = NULL;
2547 	return ret;
2548 }
2549 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2550 
2551 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2552 {
2553 	int i;
2554 
2555 	for (i = 0; i < nr_cpu_ids; i++)
2556 		blk_mq_free_map_and_requests(set, i);
2557 
2558 	kfree(set->mq_map);
2559 	set->mq_map = NULL;
2560 
2561 	kfree(set->tags);
2562 	set->tags = NULL;
2563 }
2564 EXPORT_SYMBOL(blk_mq_free_tag_set);
2565 
2566 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2567 {
2568 	struct blk_mq_tag_set *set = q->tag_set;
2569 	struct blk_mq_hw_ctx *hctx;
2570 	int i, ret;
2571 
2572 	if (!set)
2573 		return -EINVAL;
2574 
2575 	blk_mq_freeze_queue(q);
2576 
2577 	ret = 0;
2578 	queue_for_each_hw_ctx(q, hctx, i) {
2579 		if (!hctx->tags)
2580 			continue;
2581 		/*
2582 		 * If we're using an MQ scheduler, just update the scheduler
2583 		 * queue depth. This is similar to what the old code would do.
2584 		 */
2585 		if (!hctx->sched_tags) {
2586 			ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
2587 							min(nr, set->queue_depth),
2588 							false);
2589 		} else {
2590 			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
2591 							nr, true);
2592 		}
2593 		if (ret)
2594 			break;
2595 	}
2596 
2597 	if (!ret)
2598 		q->nr_requests = nr;
2599 
2600 	blk_mq_unfreeze_queue(q);
2601 
2602 	return ret;
2603 }
2604 
2605 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
2606 							int nr_hw_queues)
2607 {
2608 	struct request_queue *q;
2609 
2610 	lockdep_assert_held(&set->tag_list_lock);
2611 
2612 	if (nr_hw_queues > nr_cpu_ids)
2613 		nr_hw_queues = nr_cpu_ids;
2614 	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2615 		return;
2616 
2617 	list_for_each_entry(q, &set->tag_list, tag_set_list)
2618 		blk_mq_freeze_queue(q);
2619 
2620 	set->nr_hw_queues = nr_hw_queues;
2621 	blk_mq_update_queue_map(set);
2622 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
2623 		blk_mq_realloc_hw_ctxs(set, q);
2624 		blk_mq_queue_reinit(q);
2625 	}
2626 
2627 	list_for_each_entry(q, &set->tag_list, tag_set_list)
2628 		blk_mq_unfreeze_queue(q);
2629 }
2630 
2631 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2632 {
2633 	mutex_lock(&set->tag_list_lock);
2634 	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
2635 	mutex_unlock(&set->tag_list_lock);
2636 }
2637 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
2638 
2639 /* Enable polling stats and return whether they were already enabled. */
2640 static bool blk_poll_stats_enable(struct request_queue *q)
2641 {
2642 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2643 	    test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
2644 		return true;
2645 	blk_stat_add_callback(q, q->poll_cb);
2646 	return false;
2647 }
2648 
2649 static void blk_mq_poll_stats_start(struct request_queue *q)
2650 {
2651 	/*
2652 	 * We don't arm the callback if polling stats are not enabled or the
2653 	 * callback is already active.
2654 	 */
2655 	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2656 	    blk_stat_is_active(q->poll_cb))
2657 		return;
2658 
2659 	blk_stat_activate_msecs(q->poll_cb, 100);
2660 }
2661 
2662 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
2663 {
2664 	struct request_queue *q = cb->data;
2665 	int bucket;
2666 
2667 	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
2668 		if (cb->stat[bucket].nr_samples)
2669 			q->poll_stat[bucket] = cb->stat[bucket];
2670 	}
2671 }
2672 
2673 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
2674 				       struct blk_mq_hw_ctx *hctx,
2675 				       struct request *rq)
2676 {
2677 	unsigned long ret = 0;
2678 	int bucket;
2679 
2680 	/*
2681 	 * If stats collection isn't on, don't sleep but turn it on for
2682 	 * future users
2683 	 */
2684 	if (!blk_poll_stats_enable(q))
2685 		return 0;
2686 
2687 	/*
2688 	 * As an optimistic guess, use half of the mean service time
2689 	 * for this type of request. We can (and should) make this smarter.
2690 	 * For instance, if the completion latencies are tight, we can
2691 	 * get closer than just half the mean. This is especially
2692 	 * important on devices where the completion latencies are longer
2693 	 * than ~10 usec. We do use the stats for the relevant IO size
2694 	 * if available which does lead to better estimates.
2695 	 */
2696 	bucket = blk_mq_poll_stats_bkt(rq);
2697 	if (bucket < 0)
2698 		return ret;
2699 
2700 	if (q->poll_stat[bucket].nr_samples)
2701 		ret = (q->poll_stat[bucket].mean + 1) / 2;
2702 
2703 	return ret;
2704 }
2705 
2706 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2707 				     struct blk_mq_hw_ctx *hctx,
2708 				     struct request *rq)
2709 {
2710 	struct hrtimer_sleeper hs;
2711 	enum hrtimer_mode mode;
2712 	unsigned int nsecs;
2713 	ktime_t kt;
2714 
2715 	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
2716 		return false;
2717 
2718 	/*
2719 	 * poll_nsec can be:
2720 	 *
2721 	 * -1:	don't ever hybrid sleep
2722 	 *  0:	use half of prev avg
2723 	 * >0:	use this specific value
2724 	 */
2725 	if (q->poll_nsec == -1)
2726 		return false;
2727 	else if (q->poll_nsec > 0)
2728 		nsecs = q->poll_nsec;
2729 	else
2730 		nsecs = blk_mq_poll_nsecs(q, hctx, rq);
2731 
2732 	if (!nsecs)
2733 		return false;
2734 
2735 	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
2736 
2737 	/*
2738 	 * This will be replaced with the stats tracking code, using
2739 	 * 'avg_completion_time / 2' as the pre-sleep target.
2740 	 */
2741 	kt = nsecs;
2742 
2743 	mode = HRTIMER_MODE_REL;
2744 	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
2745 	hrtimer_set_expires(&hs.timer, kt);
2746 
2747 	hrtimer_init_sleeper(&hs, current);
2748 	do {
2749 		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
2750 			break;
2751 		set_current_state(TASK_UNINTERRUPTIBLE);
2752 		hrtimer_start_expires(&hs.timer, mode);
2753 		if (hs.task)
2754 			io_schedule();
2755 		hrtimer_cancel(&hs.timer);
2756 		mode = HRTIMER_MODE_ABS;
2757 	} while (hs.task && !signal_pending(current));
2758 
2759 	__set_current_state(TASK_RUNNING);
2760 	destroy_hrtimer_on_stack(&hs.timer);
2761 	return true;
2762 }
2763 
2764 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
2765 {
2766 	struct request_queue *q = hctx->queue;
2767 	long state;
2768 
2769 	/*
2770 	 * If we sleep, have the caller restart the poll loop to reset
2771 	 * the state. Like for the other success return cases, the
2772 	 * caller is responsible for checking if the IO completed. If
2773 	 * the IO isn't complete, we'll get called again and will go
2774 	 * straight to the busy poll loop.
2775 	 */
2776 	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
2777 		return true;
2778 
2779 	hctx->poll_considered++;
2780 
2781 	state = current->state;
2782 	while (!need_resched()) {
2783 		int ret;
2784 
2785 		hctx->poll_invoked++;
2786 
2787 		ret = q->mq_ops->poll(hctx, rq->tag);
2788 		if (ret > 0) {
2789 			hctx->poll_success++;
2790 			set_current_state(TASK_RUNNING);
2791 			return true;
2792 		}
2793 
2794 		if (signal_pending_state(state, current))
2795 			set_current_state(TASK_RUNNING);
2796 
2797 		if (current->state == TASK_RUNNING)
2798 			return true;
2799 		if (ret < 0)
2800 			break;
2801 		cpu_relax();
2802 	}
2803 
2804 	return false;
2805 }
2806 
2807 bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2808 {
2809 	struct blk_mq_hw_ctx *hctx;
2810 	struct blk_plug *plug;
2811 	struct request *rq;
2812 
2813 	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
2814 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
2815 		return false;
2816 
2817 	plug = current->plug;
2818 	if (plug)
2819 		blk_flush_plug_list(plug, false);
2820 
2821 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
2822 	if (!blk_qc_t_is_internal(cookie))
2823 		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
2824 	else {
2825 		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
2826 		/*
2827 		 * With scheduling, if the request has completed, we'll
2828 		 * get a NULL return here, as we clear the sched tag when
2829 		 * that happens. The request still remains valid, like always,
2830 		 * so we should be safe with just the NULL check.
2831 		 */
2832 		if (!rq)
2833 			return false;
2834 	}
2835 
2836 	return __blk_mq_poll(hctx, rq);
2837 }
2838 EXPORT_SYMBOL_GPL(blk_mq_poll);
2839 
2840 static int __init blk_mq_init(void)
2841 {
2842 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2843 				blk_mq_hctx_notify_dead);
2844 	return 0;
2845 }
2846 subsys_initcall(blk_mq_init);
2847