xref: /linux/block/mq-deadline.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
4  *  for the blk-mq scheduling framework
5  *
6  *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
7  */
8 #include <linux/kernel.h>
9 #include <linux/fs.h>
10 #include <linux/blkdev.h>
11 #include <linux/bio.h>
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/init.h>
15 #include <linux/compiler.h>
16 #include <linux/rbtree.h>
17 #include <linux/sbitmap.h>
18 
19 #include <trace/events/block.h>
20 
21 #include "elevator.h"
22 #include "blk.h"
23 #include "blk-mq.h"
24 #include "blk-mq-debugfs.h"
25 #include "blk-mq-sched.h"
26 
27 /*
28  * See Documentation/block/deadline-iosched.rst
29  */
30 static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
31 static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
32 /*
33  * Time after which to dispatch lower priority requests even if higher
34  * priority requests are pending.
35  */
36 static const int prio_aging_expire = 10 * HZ;
37 static const int writes_starved = 2;    /* max times reads can starve a write */
38 static const int fifo_batch = 16;       /* # of sequential requests treated as one
39 				     by the above parameters. For throughput. */
40 
41 enum dd_data_dir {
42 	DD_READ		= READ,
43 	DD_WRITE	= WRITE,
44 };
45 
46 enum { DD_DIR_COUNT = 2 };
47 
48 enum dd_prio {
49 	DD_RT_PRIO	= 0,
50 	DD_BE_PRIO	= 1,
51 	DD_IDLE_PRIO	= 2,
52 	DD_PRIO_MAX	= 2,
53 };
54 
55 enum { DD_PRIO_COUNT = 3 };
56 
57 /*
58  * I/O statistics per I/O priority. It is fine if these counters overflow.
59  * What matters is that these counters are at least as wide as
60  * log2(max_outstanding_requests).
61  */
62 struct io_stats_per_prio {
63 	uint32_t inserted;
64 	uint32_t merged;
65 	uint32_t dispatched;
66 	atomic_t completed;
67 };
68 
69 /*
70  * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
71  * present on both sort_list[] and fifo_list[].
72  */
73 struct dd_per_prio {
74 	struct rb_root sort_list[DD_DIR_COUNT];
75 	struct list_head fifo_list[DD_DIR_COUNT];
76 	/* Position of the most recently dispatched request. */
77 	sector_t latest_pos[DD_DIR_COUNT];
78 	struct io_stats_per_prio stats;
79 };
80 
81 struct deadline_data {
82 	/*
83 	 * run time data
84 	 */
85 
86 	struct list_head dispatch;
87 	struct dd_per_prio per_prio[DD_PRIO_COUNT];
88 
89 	/* Data direction of latest dispatched request. */
90 	enum dd_data_dir last_dir;
91 	unsigned int batching;		/* number of sequential requests made */
92 	unsigned int starved;		/* times reads have starved writes */
93 
94 	/*
95 	 * settings that change how the i/o scheduler behaves
96 	 */
97 	int fifo_expire[DD_DIR_COUNT];
98 	int fifo_batch;
99 	int writes_starved;
100 	int front_merges;
101 	int prio_aging_expire;
102 
103 	spinlock_t lock;
104 };
105 
106 /* Maps an I/O priority class to a deadline scheduler priority. */
107 static const enum dd_prio ioprio_class_to_prio[] = {
108 	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
109 	[IOPRIO_CLASS_RT]	= DD_RT_PRIO,
110 	[IOPRIO_CLASS_BE]	= DD_BE_PRIO,
111 	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
112 };
113 
114 static inline struct rb_root *
115 deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
116 {
117 	return &per_prio->sort_list[rq_data_dir(rq)];
118 }
119 
120 /*
121  * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
122  * request.
123  */
124 static u8 dd_rq_ioclass(struct request *rq)
125 {
126 	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
127 }
128 
129 /*
130  * Return the first request for which blk_rq_pos() >= @pos.
131  */
132 static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
133 				enum dd_data_dir data_dir, sector_t pos)
134 {
135 	struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
136 	struct request *rq, *res = NULL;
137 
138 	while (node) {
139 		rq = rb_entry_rq(node);
140 		if (blk_rq_pos(rq) >= pos) {
141 			res = rq;
142 			node = node->rb_left;
143 		} else {
144 			node = node->rb_right;
145 		}
146 	}
147 	return res;
148 }
149 
150 static void
151 deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
152 {
153 	struct rb_root *root = deadline_rb_root(per_prio, rq);
154 
155 	elv_rb_add(root, rq);
156 }
157 
158 static inline void
159 deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
160 {
161 	elv_rb_del(deadline_rb_root(per_prio, rq), rq);
162 }
163 
164 /*
165  * remove rq from rbtree and fifo.
166  */
167 static void deadline_remove_request(struct request_queue *q,
168 				    struct dd_per_prio *per_prio,
169 				    struct request *rq)
170 {
171 	list_del_init(&rq->queuelist);
172 
173 	/*
174 	 * We might not be on the rbtree, if we are doing an insert merge
175 	 */
176 	if (!RB_EMPTY_NODE(&rq->rb_node))
177 		deadline_del_rq_rb(per_prio, rq);
178 
179 	elv_rqhash_del(q, rq);
180 	if (q->last_merge == rq)
181 		q->last_merge = NULL;
182 }
183 
184 static void dd_request_merged(struct request_queue *q, struct request *req,
185 			      enum elv_merge type)
186 {
187 	struct deadline_data *dd = q->elevator->elevator_data;
188 	const u8 ioprio_class = dd_rq_ioclass(req);
189 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
190 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
191 
192 	/*
193 	 * if the merge was a front merge, we need to reposition request
194 	 */
195 	if (type == ELEVATOR_FRONT_MERGE) {
196 		elv_rb_del(deadline_rb_root(per_prio, req), req);
197 		deadline_add_rq_rb(per_prio, req);
198 	}
199 }
200 
201 /*
202  * Callback function that is invoked after @next has been merged into @req.
203  */
204 static void dd_merged_requests(struct request_queue *q, struct request *req,
205 			       struct request *next)
206 {
207 	struct deadline_data *dd = q->elevator->elevator_data;
208 	const u8 ioprio_class = dd_rq_ioclass(next);
209 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
210 
211 	lockdep_assert_held(&dd->lock);
212 
213 	dd->per_prio[prio].stats.merged++;
214 
215 	/*
216 	 * if next expires before rq, assign its expire time to rq
217 	 * and move into next position (next will be deleted) in fifo
218 	 */
219 	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
220 		if (time_before((unsigned long)next->fifo_time,
221 				(unsigned long)req->fifo_time)) {
222 			list_move(&req->queuelist, &next->queuelist);
223 			req->fifo_time = next->fifo_time;
224 		}
225 	}
226 
227 	/*
228 	 * kill knowledge of next, this one is a goner
229 	 */
230 	deadline_remove_request(q, &dd->per_prio[prio], next);
231 }
232 
233 /*
234  * move an entry to dispatch queue
235  */
236 static void
237 deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
238 		      struct request *rq)
239 {
240 	/*
241 	 * take it off the sort and fifo list
242 	 */
243 	deadline_remove_request(rq->q, per_prio, rq);
244 }
245 
246 /* Number of requests queued for a given priority level. */
247 static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
248 {
249 	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
250 
251 	lockdep_assert_held(&dd->lock);
252 
253 	return stats->inserted - atomic_read(&stats->completed);
254 }
255 
256 /*
257  * deadline_check_fifo returns true if and only if there are expired requests
258  * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]).
259  */
260 static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
261 				       enum dd_data_dir data_dir)
262 {
263 	struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
264 
265 	return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
266 }
267 
268 /*
269  * For the specified data direction, return the next request to
270  * dispatch using arrival ordered lists.
271  */
272 static struct request *
273 deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
274 		      enum dd_data_dir data_dir)
275 {
276 	if (list_empty(&per_prio->fifo_list[data_dir]))
277 		return NULL;
278 
279 	return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
280 }
281 
282 /*
283  * For the specified data direction, return the next request to
284  * dispatch using sector position sorted lists.
285  */
286 static struct request *
287 deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
288 		      enum dd_data_dir data_dir)
289 {
290 	return deadline_from_pos(per_prio, data_dir,
291 				 per_prio->latest_pos[data_dir]);
292 }
293 
294 /*
295  * Returns true if and only if @rq started after @latest_start where
296  * @latest_start is in jiffies.
297  */
298 static bool started_after(struct deadline_data *dd, struct request *rq,
299 			  unsigned long latest_start)
300 {
301 	unsigned long start_time = (unsigned long)rq->fifo_time;
302 
303 	start_time -= dd->fifo_expire[rq_data_dir(rq)];
304 
305 	return time_after(start_time, latest_start);
306 }
307 
308 static struct request *dd_start_request(struct deadline_data *dd,
309 					enum dd_data_dir data_dir,
310 					struct request *rq)
311 {
312 	u8 ioprio_class = dd_rq_ioclass(rq);
313 	enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
314 
315 	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
316 	dd->per_prio[prio].stats.dispatched++;
317 	rq->rq_flags |= RQF_STARTED;
318 	return rq;
319 }
320 
321 /*
322  * deadline_dispatch_requests selects the best request according to
323  * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
324  */
325 static struct request *__dd_dispatch_request(struct deadline_data *dd,
326 					     struct dd_per_prio *per_prio,
327 					     unsigned long latest_start)
328 {
329 	struct request *rq, *next_rq;
330 	enum dd_data_dir data_dir;
331 
332 	lockdep_assert_held(&dd->lock);
333 
334 	/*
335 	 * batches are currently reads XOR writes
336 	 */
337 	rq = deadline_next_request(dd, per_prio, dd->last_dir);
338 	if (rq && dd->batching < dd->fifo_batch) {
339 		/* we have a next request and are still entitled to batch */
340 		data_dir = rq_data_dir(rq);
341 		goto dispatch_request;
342 	}
343 
344 	/*
345 	 * at this point we are not running a batch. select the appropriate
346 	 * data direction (read / write)
347 	 */
348 
349 	if (!list_empty(&per_prio->fifo_list[DD_READ])) {
350 		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
351 
352 		if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
353 		    (dd->starved++ >= dd->writes_starved))
354 			goto dispatch_writes;
355 
356 		data_dir = DD_READ;
357 
358 		goto dispatch_find_request;
359 	}
360 
361 	/*
362 	 * there are either no reads or writes have been starved
363 	 */
364 
365 	if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
366 dispatch_writes:
367 		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
368 
369 		dd->starved = 0;
370 
371 		data_dir = DD_WRITE;
372 
373 		goto dispatch_find_request;
374 	}
375 
376 	return NULL;
377 
378 dispatch_find_request:
379 	/*
380 	 * we are not running a batch, find best request for selected data_dir
381 	 */
382 	next_rq = deadline_next_request(dd, per_prio, data_dir);
383 	if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
384 		/*
385 		 * A deadline has expired, the last request was in the other
386 		 * direction, or we have run out of higher-sectored requests.
387 		 * Start again from the request with the earliest expiry time.
388 		 */
389 		rq = deadline_fifo_request(dd, per_prio, data_dir);
390 	} else {
391 		/*
392 		 * The last req was the same dir and we have a next request in
393 		 * sort order. No expired requests so continue on from here.
394 		 */
395 		rq = next_rq;
396 	}
397 
398 	if (!rq)
399 		return NULL;
400 
401 	dd->last_dir = data_dir;
402 	dd->batching = 0;
403 
404 dispatch_request:
405 	if (started_after(dd, rq, latest_start))
406 		return NULL;
407 
408 	/*
409 	 * rq is the selected appropriate request.
410 	 */
411 	dd->batching++;
412 	deadline_move_request(dd, per_prio, rq);
413 	return dd_start_request(dd, data_dir, rq);
414 }
415 
416 /*
417  * Check whether there are any requests with priority other than DD_RT_PRIO
418  * that were inserted more than prio_aging_expire jiffies ago.
419  */
420 static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
421 						      unsigned long now)
422 {
423 	struct request *rq;
424 	enum dd_prio prio;
425 	int prio_cnt;
426 
427 	lockdep_assert_held(&dd->lock);
428 
429 	prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
430 		   !!dd_queued(dd, DD_IDLE_PRIO);
431 	if (prio_cnt < 2)
432 		return NULL;
433 
434 	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
435 		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
436 					   now - dd->prio_aging_expire);
437 		if (rq)
438 			return rq;
439 	}
440 
441 	return NULL;
442 }
443 
444 /*
445  * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
446  *
447  * One confusing aspect here is that we get called for a specific
448  * hardware queue, but we may return a request that is for a
449  * different hardware queue. This is because mq-deadline has shared
450  * state for all hardware queues, in terms of sorting, FIFOs, etc.
451  */
452 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
453 {
454 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
455 	const unsigned long now = jiffies;
456 	struct request *rq;
457 	enum dd_prio prio;
458 
459 	spin_lock(&dd->lock);
460 
461 	if (!list_empty(&dd->dispatch)) {
462 		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
463 		list_del_init(&rq->queuelist);
464 		dd_start_request(dd, rq_data_dir(rq), rq);
465 		goto unlock;
466 	}
467 
468 	rq = dd_dispatch_prio_aged_requests(dd, now);
469 	if (rq)
470 		goto unlock;
471 
472 	/*
473 	 * Next, dispatch requests in priority order. Ignore lower priority
474 	 * requests if any higher priority requests are pending.
475 	 */
476 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
477 		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
478 		if (rq || dd_queued(dd, prio))
479 			break;
480 	}
481 
482 unlock:
483 	spin_unlock(&dd->lock);
484 
485 	return rq;
486 }
487 
488 static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
489 {
490 	if (!blk_mq_is_sync_read(opf))
491 		data->shallow_depth = data->q->async_depth;
492 }
493 
494 /* Called by blk_mq_init_sched() and blk_mq_update_nr_requests(). */
495 static void dd_depth_updated(struct request_queue *q)
496 {
497 	blk_mq_set_min_shallow_depth(q, q->async_depth);
498 }
499 
500 static void dd_exit_sched(struct elevator_queue *e)
501 {
502 	struct deadline_data *dd = e->elevator_data;
503 	enum dd_prio prio;
504 
505 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
506 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
507 		const struct io_stats_per_prio *stats = &per_prio->stats;
508 		uint32_t queued;
509 
510 		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
511 		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
512 
513 		spin_lock(&dd->lock);
514 		queued = dd_queued(dd, prio);
515 		spin_unlock(&dd->lock);
516 
517 		WARN_ONCE(queued != 0,
518 			  "statistics for priority %d: i %u m %u d %u c %u\n",
519 			  prio, stats->inserted, stats->merged,
520 			  stats->dispatched, atomic_read(&stats->completed));
521 	}
522 
523 	kfree(dd);
524 }
525 
526 /*
527  * initialize elevator private data (deadline_data).
528  */
529 static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
530 {
531 	struct deadline_data *dd;
532 	enum dd_prio prio;
533 
534 	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
535 	if (!dd)
536 		return -ENOMEM;
537 
538 	eq->elevator_data = dd;
539 
540 	INIT_LIST_HEAD(&dd->dispatch);
541 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
542 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
543 
544 		INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
545 		INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
546 		per_prio->sort_list[DD_READ] = RB_ROOT;
547 		per_prio->sort_list[DD_WRITE] = RB_ROOT;
548 	}
549 	dd->fifo_expire[DD_READ] = read_expire;
550 	dd->fifo_expire[DD_WRITE] = write_expire;
551 	dd->writes_starved = writes_starved;
552 	dd->front_merges = 1;
553 	dd->last_dir = DD_WRITE;
554 	dd->fifo_batch = fifo_batch;
555 	dd->prio_aging_expire = prio_aging_expire;
556 	spin_lock_init(&dd->lock);
557 
558 	/* We dispatch from request queue wide instead of hw queue */
559 	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
560 
561 	q->elevator = eq;
562 	q->async_depth = q->nr_requests;
563 	dd_depth_updated(q);
564 	return 0;
565 }
566 
567 /*
568  * Try to merge @bio into an existing request. If @bio has been merged into
569  * an existing request, store the pointer to that request into *@rq.
570  */
571 static int dd_request_merge(struct request_queue *q, struct request **rq,
572 			    struct bio *bio)
573 {
574 	struct deadline_data *dd = q->elevator->elevator_data;
575 	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
576 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
577 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
578 	sector_t sector = bio_end_sector(bio);
579 	struct request *__rq;
580 
581 	if (!dd->front_merges)
582 		return ELEVATOR_NO_MERGE;
583 
584 	__rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
585 	if (__rq) {
586 		BUG_ON(sector != blk_rq_pos(__rq));
587 
588 		if (elv_bio_merge_ok(__rq, bio)) {
589 			*rq = __rq;
590 			if (blk_discard_mergable(__rq))
591 				return ELEVATOR_DISCARD_MERGE;
592 			return ELEVATOR_FRONT_MERGE;
593 		}
594 	}
595 
596 	return ELEVATOR_NO_MERGE;
597 }
598 
599 /*
600  * Attempt to merge a bio into an existing request. This function is called
601  * before @bio is associated with a request.
602  */
603 static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
604 		unsigned int nr_segs)
605 {
606 	struct deadline_data *dd = q->elevator->elevator_data;
607 	struct request *free = NULL;
608 	bool ret;
609 
610 	spin_lock(&dd->lock);
611 	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
612 	spin_unlock(&dd->lock);
613 
614 	if (free)
615 		blk_mq_free_request(free);
616 
617 	return ret;
618 }
619 
620 /*
621  * add rq to rbtree and fifo
622  */
623 static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
624 			      blk_insert_t flags, struct list_head *free)
625 {
626 	struct request_queue *q = hctx->queue;
627 	struct deadline_data *dd = q->elevator->elevator_data;
628 	const enum dd_data_dir data_dir = rq_data_dir(rq);
629 	u16 ioprio = req_get_ioprio(rq);
630 	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
631 	struct dd_per_prio *per_prio;
632 	enum dd_prio prio;
633 
634 	lockdep_assert_held(&dd->lock);
635 
636 	prio = ioprio_class_to_prio[ioprio_class];
637 	per_prio = &dd->per_prio[prio];
638 	if (!rq->elv.priv[0])
639 		per_prio->stats.inserted++;
640 	rq->elv.priv[0] = per_prio;
641 
642 	if (blk_mq_sched_try_insert_merge(q, rq, free))
643 		return;
644 
645 	trace_block_rq_insert(rq);
646 
647 	if (flags & BLK_MQ_INSERT_AT_HEAD) {
648 		list_add(&rq->queuelist, &dd->dispatch);
649 		rq->fifo_time = jiffies;
650 	} else {
651 		deadline_add_rq_rb(per_prio, rq);
652 
653 		if (rq_mergeable(rq)) {
654 			elv_rqhash_add(q, rq);
655 			if (!q->last_merge)
656 				q->last_merge = rq;
657 		}
658 
659 		/*
660 		 * set expire time and add to fifo list
661 		 */
662 		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
663 		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
664 	}
665 }
666 
667 /*
668  * Called from blk_mq_insert_request() or blk_mq_dispatch_list().
669  */
670 static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
671 			       struct list_head *list,
672 			       blk_insert_t flags)
673 {
674 	struct request_queue *q = hctx->queue;
675 	struct deadline_data *dd = q->elevator->elevator_data;
676 	LIST_HEAD(free);
677 
678 	spin_lock(&dd->lock);
679 	while (!list_empty(list)) {
680 		struct request *rq;
681 
682 		rq = list_first_entry(list, struct request, queuelist);
683 		list_del_init(&rq->queuelist);
684 		dd_insert_request(hctx, rq, flags, &free);
685 	}
686 	spin_unlock(&dd->lock);
687 
688 	blk_mq_free_requests(&free);
689 }
690 
691 /* Callback from inside blk_mq_rq_ctx_init(). */
692 static void dd_prepare_request(struct request *rq)
693 {
694 	rq->elv.priv[0] = NULL;
695 }
696 
697 /*
698  * Callback from inside blk_mq_free_request().
699  */
700 static void dd_finish_request(struct request *rq)
701 {
702 	struct dd_per_prio *per_prio = rq->elv.priv[0];
703 
704 	/*
705 	 * The block layer core may call dd_finish_request() without having
706 	 * called dd_insert_requests(). Skip requests that bypassed I/O
707 	 * scheduling. See also blk_mq_request_bypass_insert().
708 	 */
709 	if (per_prio)
710 		atomic_inc(&per_prio->stats.completed);
711 }
712 
713 static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
714 {
715 	return !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
716 		!list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
717 }
718 
719 static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
720 {
721 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
722 	enum dd_prio prio;
723 
724 	if (!list_empty_careful(&dd->dispatch))
725 		return true;
726 
727 	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
728 		if (dd_has_work_for_prio(&dd->per_prio[prio]))
729 			return true;
730 
731 	return false;
732 }
733 
734 /*
735  * sysfs parts below
736  */
737 #define SHOW_INT(__FUNC, __VAR)						\
738 static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
739 {									\
740 	struct deadline_data *dd = e->elevator_data;			\
741 									\
742 	return sysfs_emit(page, "%d\n", __VAR);				\
743 }
744 #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
745 SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
746 SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
747 SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
748 SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
749 SHOW_INT(deadline_front_merges_show, dd->front_merges);
750 SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
751 #undef SHOW_INT
752 #undef SHOW_JIFFIES
753 
754 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
755 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
756 {									\
757 	struct deadline_data *dd = e->elevator_data;			\
758 	int __data, __ret;						\
759 									\
760 	__ret = kstrtoint(page, 0, &__data);				\
761 	if (__ret < 0)							\
762 		return __ret;						\
763 	if (__data < (MIN))						\
764 		__data = (MIN);						\
765 	else if (__data > (MAX))					\
766 		__data = (MAX);						\
767 	*(__PTR) = __CONV(__data);					\
768 	return count;							\
769 }
770 #define STORE_INT(__FUNC, __PTR, MIN, MAX)				\
771 	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
772 #define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)				\
773 	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
774 STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
775 STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
776 STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
777 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
778 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
779 STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
780 #undef STORE_FUNCTION
781 #undef STORE_INT
782 #undef STORE_JIFFIES
783 
784 #define DD_ATTR(name) \
785 	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
786 
787 static const struct elv_fs_entry deadline_attrs[] = {
788 	DD_ATTR(read_expire),
789 	DD_ATTR(write_expire),
790 	DD_ATTR(writes_starved),
791 	DD_ATTR(front_merges),
792 	DD_ATTR(fifo_batch),
793 	DD_ATTR(prio_aging_expire),
794 	__ATTR_NULL
795 };
796 
797 #ifdef CONFIG_BLK_DEBUG_FS
798 #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)		\
799 static void *deadline_##name##_fifo_start(struct seq_file *m,		\
800 					  loff_t *pos)			\
801 	__acquires(&dd->lock)						\
802 {									\
803 	struct request_queue *q = m->private;				\
804 	struct deadline_data *dd = q->elevator->elevator_data;		\
805 	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
806 									\
807 	spin_lock(&dd->lock);						\
808 	return seq_list_start(&per_prio->fifo_list[data_dir], *pos);	\
809 }									\
810 									\
811 static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,	\
812 					 loff_t *pos)			\
813 {									\
814 	struct request_queue *q = m->private;				\
815 	struct deadline_data *dd = q->elevator->elevator_data;		\
816 	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
817 									\
818 	return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);	\
819 }									\
820 									\
821 static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)	\
822 	__releases(&dd->lock)						\
823 {									\
824 	struct request_queue *q = m->private;				\
825 	struct deadline_data *dd = q->elevator->elevator_data;		\
826 									\
827 	spin_unlock(&dd->lock);						\
828 }									\
829 									\
830 static const struct seq_operations deadline_##name##_fifo_seq_ops = {	\
831 	.start	= deadline_##name##_fifo_start,				\
832 	.next	= deadline_##name##_fifo_next,				\
833 	.stop	= deadline_##name##_fifo_stop,				\
834 	.show	= blk_mq_debugfs_rq_show,				\
835 };									\
836 									\
837 static int deadline_##name##_next_rq_show(void *data,			\
838 					  struct seq_file *m)		\
839 {									\
840 	struct request_queue *q = data;					\
841 	struct deadline_data *dd = q->elevator->elevator_data;		\
842 	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
843 	struct request *rq;						\
844 									\
845 	rq = deadline_from_pos(per_prio, data_dir,			\
846 			       per_prio->latest_pos[data_dir]);		\
847 	if (rq)								\
848 		__blk_mq_debugfs_rq_show(m, rq);			\
849 	return 0;							\
850 }
851 
852 DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
853 DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
854 DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
855 DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
856 DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
857 DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
858 #undef DEADLINE_DEBUGFS_DDIR_ATTRS
859 
860 static int deadline_batching_show(void *data, struct seq_file *m)
861 {
862 	struct request_queue *q = data;
863 	struct deadline_data *dd = q->elevator->elevator_data;
864 
865 	seq_printf(m, "%u\n", dd->batching);
866 	return 0;
867 }
868 
869 static int deadline_starved_show(void *data, struct seq_file *m)
870 {
871 	struct request_queue *q = data;
872 	struct deadline_data *dd = q->elevator->elevator_data;
873 
874 	seq_printf(m, "%u\n", dd->starved);
875 	return 0;
876 }
877 
878 static int dd_queued_show(void *data, struct seq_file *m)
879 {
880 	struct request_queue *q = data;
881 	struct deadline_data *dd = q->elevator->elevator_data;
882 	u32 rt, be, idle;
883 
884 	spin_lock(&dd->lock);
885 	rt = dd_queued(dd, DD_RT_PRIO);
886 	be = dd_queued(dd, DD_BE_PRIO);
887 	idle = dd_queued(dd, DD_IDLE_PRIO);
888 	spin_unlock(&dd->lock);
889 
890 	seq_printf(m, "%u %u %u\n", rt, be, idle);
891 
892 	return 0;
893 }
894 
895 /* Number of requests owned by the block driver for a given priority. */
896 static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
897 {
898 	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
899 
900 	lockdep_assert_held(&dd->lock);
901 
902 	return stats->dispatched + stats->merged -
903 		atomic_read(&stats->completed);
904 }
905 
906 static int dd_owned_by_driver_show(void *data, struct seq_file *m)
907 {
908 	struct request_queue *q = data;
909 	struct deadline_data *dd = q->elevator->elevator_data;
910 	u32 rt, be, idle;
911 
912 	spin_lock(&dd->lock);
913 	rt = dd_owned_by_driver(dd, DD_RT_PRIO);
914 	be = dd_owned_by_driver(dd, DD_BE_PRIO);
915 	idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
916 	spin_unlock(&dd->lock);
917 
918 	seq_printf(m, "%u %u %u\n", rt, be, idle);
919 
920 	return 0;
921 }
922 
923 static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
924 	__acquires(&dd->lock)
925 {
926 	struct request_queue *q = m->private;
927 	struct deadline_data *dd = q->elevator->elevator_data;
928 
929 	spin_lock(&dd->lock);
930 	return seq_list_start(&dd->dispatch, *pos);
931 }
932 
933 static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
934 {
935 	struct request_queue *q = m->private;
936 	struct deadline_data *dd = q->elevator->elevator_data;
937 
938 	return seq_list_next(v, &dd->dispatch, pos);
939 }
940 
941 static void deadline_dispatch_stop(struct seq_file *m, void *v)
942 	__releases(&dd->lock)
943 {
944 	struct request_queue *q = m->private;
945 	struct deadline_data *dd = q->elevator->elevator_data;
946 
947 	spin_unlock(&dd->lock);
948 }
949 
950 static const struct seq_operations deadline_dispatch_seq_ops = {
951 	.start	= deadline_dispatch_start,
952 	.next	= deadline_dispatch_next,
953 	.stop	= deadline_dispatch_stop,
954 	.show	= blk_mq_debugfs_rq_show,
955 };
956 
957 #define DEADLINE_QUEUE_DDIR_ATTRS(name)					\
958 	{#name "_fifo_list", 0400,					\
959 			.seq_ops = &deadline_##name##_fifo_seq_ops}
960 #define DEADLINE_NEXT_RQ_ATTR(name)					\
961 	{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
962 static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
963 	DEADLINE_QUEUE_DDIR_ATTRS(read0),
964 	DEADLINE_QUEUE_DDIR_ATTRS(write0),
965 	DEADLINE_QUEUE_DDIR_ATTRS(read1),
966 	DEADLINE_QUEUE_DDIR_ATTRS(write1),
967 	DEADLINE_QUEUE_DDIR_ATTRS(read2),
968 	DEADLINE_QUEUE_DDIR_ATTRS(write2),
969 	DEADLINE_NEXT_RQ_ATTR(read0),
970 	DEADLINE_NEXT_RQ_ATTR(write0),
971 	DEADLINE_NEXT_RQ_ATTR(read1),
972 	DEADLINE_NEXT_RQ_ATTR(write1),
973 	DEADLINE_NEXT_RQ_ATTR(read2),
974 	DEADLINE_NEXT_RQ_ATTR(write2),
975 	{"batching", 0400, deadline_batching_show},
976 	{"starved", 0400, deadline_starved_show},
977 	{"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops},
978 	{"owned_by_driver", 0400, dd_owned_by_driver_show},
979 	{"queued", 0400, dd_queued_show},
980 	{},
981 };
982 #undef DEADLINE_QUEUE_DDIR_ATTRS
983 #endif
984 
985 static struct elevator_type mq_deadline = {
986 	.ops = {
987 		.depth_updated		= dd_depth_updated,
988 		.limit_depth		= dd_limit_depth,
989 		.insert_requests	= dd_insert_requests,
990 		.dispatch_request	= dd_dispatch_request,
991 		.prepare_request	= dd_prepare_request,
992 		.finish_request		= dd_finish_request,
993 		.next_request		= elv_rb_latter_request,
994 		.former_request		= elv_rb_former_request,
995 		.bio_merge		= dd_bio_merge,
996 		.request_merge		= dd_request_merge,
997 		.requests_merged	= dd_merged_requests,
998 		.request_merged		= dd_request_merged,
999 		.has_work		= dd_has_work,
1000 		.init_sched		= dd_init_sched,
1001 		.exit_sched		= dd_exit_sched,
1002 	},
1003 
1004 #ifdef CONFIG_BLK_DEBUG_FS
1005 	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
1006 #endif
1007 	.elevator_attrs = deadline_attrs,
1008 	.elevator_name = "mq-deadline",
1009 	.elevator_alias = "deadline",
1010 	.elevator_owner = THIS_MODULE,
1011 };
1012 MODULE_ALIAS("mq-deadline-iosched");
1013 
1014 static int __init deadline_init(void)
1015 {
1016 	return elv_register(&mq_deadline);
1017 }
1018 
1019 static void __exit deadline_exit(void)
1020 {
1021 	elv_unregister(&mq_deadline);
1022 }
1023 
1024 module_init(deadline_init);
1025 module_exit(deadline_exit);
1026 
1027 MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
1028 MODULE_LICENSE("GPL");
1029 MODULE_DESCRIPTION("MQ deadline IO scheduler");
1030