xref: /linux/block/blk-iocost.c (revision 37744feebc086908fd89760650f458ab19071750)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of a trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If B then starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (HWEIGHT_WHOLE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth loss.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, working out who
149  * can donate and who should take back how much requires hweight
150  * propagations anyway, making it easier to implement and understand as a
151  * separate mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - del_ms	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include "blk-rq-qos.h"
183 #include "blk-stat.h"
184 #include "blk-wbt.h"
185 
186 #ifdef CONFIG_TRACEPOINTS
187 
188 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
189 #define TRACE_IOCG_PATH_LEN 1024
190 static DEFINE_SPINLOCK(trace_iocg_path_lock);
191 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
192 
193 #define TRACE_IOCG_PATH(type, iocg, ...)					\
194 	do {									\
195 		unsigned long flags;						\
196 		if (trace_iocost_##type##_enabled()) {				\
197 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
198 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
199 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
200 			trace_iocost_##type(iocg, trace_iocg_path,		\
201 					      ##__VA_ARGS__);			\
202 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
203 		}								\
204 	} while (0)
205 
206 #else	/* CONFIG_TRACEPOINTS */
207 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
208 #endif	/* CONFIG_TRACEPOINTS */
209 
210 enum {
211 	MILLION			= 1000000,
212 
213 	/* timer period is calculated from latency requirements, bound it */
214 	MIN_PERIOD		= USEC_PER_MSEC,
215 	MAX_PERIOD		= USEC_PER_SEC,
216 
217 	/*
218 	 * A cgroup's vtime can run 50% behind the device vtime, which
219 	 * serves as its IO credit buffer.  Surplus weight adjustment is
220 	 * immediately canceled if the vtime margin runs below 10%.
221 	 */
222 	MARGIN_PCT		= 50,
223 	INUSE_MARGIN_PCT	= 10,
224 
225 	/* Have some play in waitq timer operations */
226 	WAITQ_TIMER_MARGIN_PCT	= 5,
227 
228 	/*
229 	 * vtime can wrap well within a reasonable uptime when vrate is
230 	 * consistently raised.  Don't trust recorded cgroup vtime if the
231 	 * period counter indicates that it's older than 5mins.
232 	 */
233 	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
234 
235 	/*
236 	 * Remember the past three non-zero usages and use the max for
237 	 * surplus calculation.  Three slots guarantee that we remember one
238 	 * full period usage from the last active stretch even after
239 	 * partial deactivation and re-activation periods.  Don't start
240 	 * giving away weight before collecting two data points to prevent
241 	 * hweight adjustments based on one partial activation period.
242 	 */
243 	NR_USAGE_SLOTS		= 3,
244 	MIN_VALID_USAGES	= 2,
245 
246 	/* 1/64k is granular enough and can easily be handled w/ u32 */
247 	HWEIGHT_WHOLE		= 1 << 16,
248 
249 	/*
250 	 * As vtime is used to calculate the cost of each IO, it needs to
251 	 * be fairly high precision.  For example, it should be able to
252 	 * represent the cost of a single page worth of discard with
253 	 * sufficient accuracy.  At the same time, it should be able to
254 	 * represent reasonably long enough durations to be useful and
255 	 * convenient during operation.
256 	 *
257 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
258 	 * granularity and days of wrap-around time even at extreme vrates.
259 	 */
260 	VTIME_PER_SEC_SHIFT	= 37,
261 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
262 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
263 
264 	/* bound vrate adjustments within two orders of magnitude */
265 	VRATE_MIN_PPM		= 10000,	/* 1% */
266 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
267 
268 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
269 	VRATE_CLAMP_ADJ_PCT	= 4,
270 
271 	/* if IOs end up waiting for requests, issue less */
272 	RQ_WAIT_BUSY_PCT	= 5,
273 
274 	/* unbusy hysteresis */
275 	UNBUSY_THR_PCT		= 75,
276 
277 	/* don't let cmds which take a very long time pin lagging for too long */
278 	MAX_LAGGING_PERIODS	= 10,
279 
280 	/*
281 	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
282 	 * donate the surplus.
283 	 */
284 	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
285 	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
286 	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
287 
288 	/* switch iff the conditions are met for longer than this */
289 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
290 
291 	/*
292 	 * Count IO size in 4k pages.  The 12bit shift helps keep the
293 	 * size-proportional component of the cost calculation within a
294 	 * similar number of digits as the per-IO cost components.
295 	 */
296 	IOC_PAGE_SHIFT		= 12,
297 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
298 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
299 
300 	/* if further apart than 16M, consider randio for linear model */
301 	LCOEF_RANDIO_PAGES	= 4096,
302 };
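
/*
 * Illustrative scale check (not part of the build): with
 * VTIME_PER_SEC_SHIFT = 37, one second of vtime is 2^37 = 137,438,953,472
 * units and VTIME_PER_USEC is ~137,438.  An IO estimated at 10ms costs
 * roughly
 *
 *	u64 cost_10ms = 10 * USEC_PER_MSEC * VTIME_PER_USEC;	// ~1.37e9
 *
 * while a u64 vtime still takes about 2^27 seconds (over four years) to
 * wrap at nominal vrate.
 */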
303 
304 enum ioc_running {
305 	IOC_IDLE,
306 	IOC_RUNNING,
307 	IOC_STOP,
308 };
309 
310 /* io.cost.qos controls including per-dev enable of the whole controller */
311 enum {
312 	QOS_ENABLE,
313 	QOS_CTRL,
314 	NR_QOS_CTRL_PARAMS,
315 };
316 
317 /* io.cost.qos params */
318 enum {
319 	QOS_RPPM,
320 	QOS_RLAT,
321 	QOS_WPPM,
322 	QOS_WLAT,
323 	QOS_MIN,
324 	QOS_MAX,
325 	NR_QOS_PARAMS,
326 };
327 
328 /* io.cost.model controls */
329 enum {
330 	COST_CTRL,
331 	COST_MODEL,
332 	NR_COST_CTRL_PARAMS,
333 };
334 
335 /* builtin linear cost model coefficients */
336 enum {
337 	I_LCOEF_RBPS,
338 	I_LCOEF_RSEQIOPS,
339 	I_LCOEF_RRANDIOPS,
340 	I_LCOEF_WBPS,
341 	I_LCOEF_WSEQIOPS,
342 	I_LCOEF_WRANDIOPS,
343 	NR_I_LCOEFS,
344 };
345 
346 enum {
347 	LCOEF_RPAGE,
348 	LCOEF_RSEQIO,
349 	LCOEF_RRANDIO,
350 	LCOEF_WPAGE,
351 	LCOEF_WSEQIO,
352 	LCOEF_WRANDIO,
353 	NR_LCOEFS,
354 };
355 
356 enum {
357 	AUTOP_INVALID,
358 	AUTOP_HDD,
359 	AUTOP_SSD_QD1,
360 	AUTOP_SSD_DFL,
361 	AUTOP_SSD_FAST,
362 };
363 
364 struct ioc_gq;
365 
366 struct ioc_params {
367 	u32				qos[NR_QOS_PARAMS];
368 	u64				i_lcoefs[NR_I_LCOEFS];
369 	u64				lcoefs[NR_LCOEFS];
370 	u32				too_fast_vrate_pct;
371 	u32				too_slow_vrate_pct;
372 };
373 
374 struct ioc_missed {
375 	u32				nr_met;
376 	u32				nr_missed;
377 	u32				last_met;
378 	u32				last_missed;
379 };
380 
381 struct ioc_pcpu_stat {
382 	struct ioc_missed		missed[2];
383 
384 	u64				rq_wait_ns;
385 	u64				last_rq_wait_ns;
386 };
387 
388 /* per device */
389 struct ioc {
390 	struct rq_qos			rqos;
391 
392 	bool				enabled;
393 
394 	struct ioc_params		params;
395 	u32				period_us;
396 	u32				margin_us;
397 	u64				vrate_min;
398 	u64				vrate_max;
399 
400 	spinlock_t			lock;
401 	struct timer_list		timer;
402 	struct list_head		active_iocgs;	/* active cgroups */
403 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
404 
405 	enum ioc_running		running;
406 	atomic64_t			vtime_rate;
407 
408 	seqcount_t			period_seqcount;
409 	u32				period_at;	/* wallclock starttime */
410 	u64				period_at_vtime; /* vtime starttime */
411 
412 	atomic64_t			cur_period;	/* inc'd each period */
413 	int				busy_level;	/* saturation history */
414 
415 	u64				inuse_margin_vtime;
416 	bool				weights_updated;
417 	atomic_t			hweight_gen;	/* for lazy hweights */
418 
419 	u64				autop_too_fast_at;
420 	u64				autop_too_slow_at;
421 	int				autop_idx;
422 	bool				user_qos_params:1;
423 	bool				user_cost_model:1;
424 };
425 
426 /* per device-cgroup pair */
427 struct ioc_gq {
428 	struct blkg_policy_data		pd;
429 	struct ioc			*ioc;
430 
431 	/*
432 	 * An iocg can get its weight from two sources - an explicit
433 	 * per-device-cgroup configuration or the default weight of the
434 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
435 	 * configuration.  `weight` is the effective weight considering
436 	 * both sources.
437 	 *
438 	 * When an idle cgroup becomes active its `active` goes from 0 to
439 	 * `weight`.  `inuse` is the surplus adjusted active weight.
440 	 * `active` and `inuse` are used to calculate `hweight_active` and
441 	 * `hweight_inuse`.
442 	 *
443 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
444 	 * surplus adjustments.
445 	 */
446 	u32				cfg_weight;
447 	u32				weight;
448 	u32				active;
449 	u32				inuse;
450 	u32				last_inuse;
451 
452 	sector_t			cursor;		/* to detect randio */
453 
454 	/*
455 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
456 	 * issued.  If lagging behind device vtime, the delta represents
457 	 * the currently available IO budget.  If running ahead, the
458 	 * overage.
459 	 *
460 	 * `vtime_done` is the same but progressed on completion rather
461 	 * than issue.  The delta behind `vtime` represents the cost of
462 	 * currently in-flight IOs.
463 	 *
464 	 * `last_vtime` is used to remember `vtime` at the end of the last
465 	 * period to calculate utilization.
466 	 */
467 	atomic64_t			vtime;
468 	atomic64_t			done_vtime;
469 	u64				abs_vdebt;
470 	u64				last_vtime;
471 
472 	/*
473 	 * The period this iocg was last active in.  Used for deactivation
474 	 * and invalidating `vtime`.
475 	 */
476 	atomic64_t			active_period;
477 	struct list_head		active_list;
478 
479 	/* see __propagate_active_weight() and current_hweight() for details */
480 	u64				child_active_sum;
481 	u64				child_inuse_sum;
482 	int				hweight_gen;
483 	u32				hweight_active;
484 	u32				hweight_inuse;
485 	bool				has_surplus;
486 
487 	struct wait_queue_head		waitq;
488 	struct hrtimer			waitq_timer;
489 	struct hrtimer			delay_timer;
490 
491 	/* usage is recorded as fractions of HWEIGHT_WHOLE */
492 	int				usage_idx;
493 	u32				usages[NR_USAGE_SLOTS];
494 
495 	/* this iocg's depth in the hierarchy and ancestors including self */
496 	int				level;
497 	struct ioc_gq			*ancestors[];
498 };
499 
500 /* per cgroup */
501 struct ioc_cgrp {
502 	struct blkcg_policy_data	cpd;
503 	unsigned int			dfl_weight;
504 };
505 
506 struct ioc_now {
507 	u64				now_ns;
508 	u32				now;
509 	u64				vnow;
510 	u64				vrate;
511 };
512 
513 struct iocg_wait {
514 	struct wait_queue_entry		wait;
515 	struct bio			*bio;
516 	u64				abs_cost;
517 	bool				committed;
518 };
519 
520 struct iocg_wake_ctx {
521 	struct ioc_gq			*iocg;
522 	u32				hw_inuse;
523 	s64				vbudget;
524 };
525 
526 static const struct ioc_params autop[] = {
527 	[AUTOP_HDD] = {
528 		.qos				= {
529 			[QOS_RLAT]		=        250000, /* 250ms */
530 			[QOS_WLAT]		=        250000,
531 			[QOS_MIN]		= VRATE_MIN_PPM,
532 			[QOS_MAX]		= VRATE_MAX_PPM,
533 		},
534 		.i_lcoefs			= {
535 			[I_LCOEF_RBPS]		=     174019176,
536 			[I_LCOEF_RSEQIOPS]	=         41708,
537 			[I_LCOEF_RRANDIOPS]	=           370,
538 			[I_LCOEF_WBPS]		=     178075866,
539 			[I_LCOEF_WSEQIOPS]	=         42705,
540 			[I_LCOEF_WRANDIOPS]	=           378,
541 		},
542 	},
543 	[AUTOP_SSD_QD1] = {
544 		.qos				= {
545 			[QOS_RLAT]		=         25000, /* 25ms */
546 			[QOS_WLAT]		=         25000,
547 			[QOS_MIN]		= VRATE_MIN_PPM,
548 			[QOS_MAX]		= VRATE_MAX_PPM,
549 		},
550 		.i_lcoefs			= {
551 			[I_LCOEF_RBPS]		=     245855193,
552 			[I_LCOEF_RSEQIOPS]	=         61575,
553 			[I_LCOEF_RRANDIOPS]	=          6946,
554 			[I_LCOEF_WBPS]		=     141365009,
555 			[I_LCOEF_WSEQIOPS]	=         33716,
556 			[I_LCOEF_WRANDIOPS]	=         26796,
557 		},
558 	},
559 	[AUTOP_SSD_DFL] = {
560 		.qos				= {
561 			[QOS_RLAT]		=         25000, /* 25ms */
562 			[QOS_WLAT]		=         25000,
563 			[QOS_MIN]		= VRATE_MIN_PPM,
564 			[QOS_MAX]		= VRATE_MAX_PPM,
565 		},
566 		.i_lcoefs			= {
567 			[I_LCOEF_RBPS]		=     488636629,
568 			[I_LCOEF_RSEQIOPS]	=          8932,
569 			[I_LCOEF_RRANDIOPS]	=          8518,
570 			[I_LCOEF_WBPS]		=     427891549,
571 			[I_LCOEF_WSEQIOPS]	=         28755,
572 			[I_LCOEF_WRANDIOPS]	=         21940,
573 		},
574 		.too_fast_vrate_pct		=           500,
575 	},
576 	[AUTOP_SSD_FAST] = {
577 		.qos				= {
578 			[QOS_RLAT]		=          5000, /* 5ms */
579 			[QOS_WLAT]		=          5000,
580 			[QOS_MIN]		= VRATE_MIN_PPM,
581 			[QOS_MAX]		= VRATE_MAX_PPM,
582 		},
583 		.i_lcoefs			= {
584 			[I_LCOEF_RBPS]		=    3102524156LLU,
585 			[I_LCOEF_RSEQIOPS]	=        724816,
586 			[I_LCOEF_RRANDIOPS]	=        778122,
587 			[I_LCOEF_WBPS]		=    1742780862LLU,
588 			[I_LCOEF_WSEQIOPS]	=        425702,
589 			[I_LCOEF_WRANDIOPS]	=        443193,
590 		},
591 		.too_slow_vrate_pct		=            10,
592 	},
593 };
594 
595 /*
596  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
597  * vtime credit shortage and down on device saturation.
598  */
599 static u32 vrate_adj_pct[] =
600 	{ 0, 0, 0, 0,
601 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
602 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
603 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
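
/*
 * Worked example (illustrative, not compiled): ioc_timer_fn() indexes this
 * table with min(abs(busy_level), ARRAY_SIZE(vrate_adj_pct) - 1).  A
 * busy_level of +8 picks 1 and scales vrate to 99% (slow down), while a
 * busy_level of -40 picks 4 and scales vrate to 104% (speed up), with the
 * result clamped to [vrate_min, vrate_max].
 */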
604 
605 static struct blkcg_policy blkcg_policy_iocost;
606 
607 /* accessors and helpers */
608 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
609 {
610 	return container_of(rqos, struct ioc, rqos);
611 }
612 
613 static struct ioc *q_to_ioc(struct request_queue *q)
614 {
615 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
616 }
617 
618 static const char *q_name(struct request_queue *q)
619 {
620 	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
621 		return kobject_name(q->kobj.parent);
622 	else
623 		return "<unknown>";
624 }
625 
626 static const char __maybe_unused *ioc_name(struct ioc *ioc)
627 {
628 	return q_name(ioc->rqos.q);
629 }
630 
631 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
632 {
633 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
634 }
635 
636 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
637 {
638 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
639 }
640 
641 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
642 {
643 	return pd_to_blkg(&iocg->pd);
644 }
645 
646 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
647 {
648 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
649 			    struct ioc_cgrp, cpd);
650 }
651 
652 /*
653  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
654  * weight, the more expensive each IO.  Must round up.
655  */
656 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
657 {
658 	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
659 }
660 
661 /*
662  * The inverse of abs_cost_to_cost().  Must round up.
663  */
664 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
665 {
666 	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
667 }
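
/*
 * Worked example (illustrative, not compiled): an IO with an absolute cost
 * of 1,000,000 vtime units charged to an iocg whose hw_inuse is a quarter
 * of HWEIGHT_WHOLE consumes four times the budget:
 *
 *	cost = abs_cost_to_cost(1000000, HWEIGHT_WHOLE / 4);	// 4,000,000
 *	abs  = cost_to_abs_cost(cost, HWEIGHT_WHOLE / 4);	// 1,000,000
 *
 * i.e. the smaller the hierarchical share, the faster a cgroup's vtime is
 * consumed for the same device-time cost.
 */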
668 
669 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
670 {
671 	bio->bi_iocost_cost = cost;
672 	atomic64_add(cost, &iocg->vtime);
673 }
674 
675 #define CREATE_TRACE_POINTS
676 #include <trace/events/iocost.h>
677 
678 /* latency QoS params changed, update period_us and all the dependent params */
679 static void ioc_refresh_period_us(struct ioc *ioc)
680 {
681 	u32 ppm, lat, multi, period_us;
682 
683 	lockdep_assert_held(&ioc->lock);
684 
685 	/* pick the higher latency target */
686 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
687 		ppm = ioc->params.qos[QOS_RPPM];
688 		lat = ioc->params.qos[QOS_RLAT];
689 	} else {
690 		ppm = ioc->params.qos[QOS_WPPM];
691 		lat = ioc->params.qos[QOS_WLAT];
692 	}
693 
694 	/*
695 	 * We want the period to be long enough to contain a healthy number
696 	 * of IOs while short enough for granular control.  Define it as a
697 	 * multiple of the latency target.  Ideally, the multiplier should
698 	 * be scaled according to the percentile so that it would nominally
699 	 * contain a certain number of requests.  Let's be simpler and
700 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
701 	 */
702 	if (ppm)
703 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
704 	else
705 		multi = 2;
706 	period_us = multi * lat;
707 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
708 
709 	/* calculate dependent params */
710 	ioc->period_us = period_us;
711 	ioc->margin_us = period_us * MARGIN_PCT / 100;
712 	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
713 			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
714 }
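
/*
 * Worked example (figures illustrative, not compiled): a QoS target of
 * rpct=95 rlat=25000 stores ppm = 950000, so
 *
 *	multi     = max((MILLION - 950000) / 50000, 2) = 2
 *	period_us = 2 * 25000 = 50000			(50ms period)
 *
 * while a pct(50) target would yield multi = 10 and a 250ms period, both
 * subject to the [MIN_PERIOD, MAX_PERIOD] clamp.
 */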
715 
716 static int ioc_autop_idx(struct ioc *ioc)
717 {
718 	int idx = ioc->autop_idx;
719 	const struct ioc_params *p = &autop[idx];
720 	u32 vrate_pct;
721 	u64 now_ns;
722 
723 	/* rotational? */
724 	if (!blk_queue_nonrot(ioc->rqos.q))
725 		return AUTOP_HDD;
726 
727 	/* handle SATA SSDs w/ broken NCQ */
728 	if (blk_queue_depth(ioc->rqos.q) == 1)
729 		return AUTOP_SSD_QD1;
730 
731 	/* use one of the normal ssd sets */
732 	if (idx < AUTOP_SSD_DFL)
733 		return AUTOP_SSD_DFL;
734 
735 	/* if user is overriding anything, maintain what was there */
736 	if (ioc->user_qos_params || ioc->user_cost_model)
737 		return idx;
738 
739 	/* step up/down based on the vrate */
740 	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
741 			      VTIME_PER_USEC);
742 	now_ns = ktime_get_ns();
743 
744 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
745 		if (!ioc->autop_too_fast_at)
746 			ioc->autop_too_fast_at = now_ns;
747 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
748 			return idx + 1;
749 	} else {
750 		ioc->autop_too_fast_at = 0;
751 	}
752 
753 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
754 		if (!ioc->autop_too_slow_at)
755 			ioc->autop_too_slow_at = now_ns;
756 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
757 			return idx - 1;
758 	} else {
759 		ioc->autop_too_slow_at = 0;
760 	}
761 
762 	return idx;
763 }
764 
765 /*
766  * Take the following as input
767  *
768  *  @bps	maximum sequential throughput
769  *  @seqiops	maximum sequential 4k iops
770  *  @randiops	maximum random 4k iops
771  *
772  * and calculate the linear model cost coefficients.
773  *
774  *  *@page	per-page cost		1s / (@bps / 4096)
775  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
776  *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
777  */
778 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
779 			u64 *page, u64 *seqio, u64 *randio)
780 {
781 	u64 v;
782 
783 	*page = *seqio = *randio = 0;
784 
785 	if (bps)
786 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
787 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
788 
789 	if (seqiops) {
790 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
791 		if (v > *page)
792 			*seqio = v - *page;
793 	}
794 
795 	if (randiops) {
796 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
797 		if (v > *page)
798 			*randio = v - *page;
799 	}
800 }
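
/*
 * Hypothetical example (device figures made up, not compiled): for a disk
 * with 200MB/s sequential read bandwidth, 40k sequential and 10k random
 * 4k read iops,
 *
 *	*page   = VTIME_PER_SEC / (200,000,000 / 4096)
 *	*seqio  = max(VTIME_PER_SEC / 40000 - *page, 0)
 *	*randio = max(VTIME_PER_SEC / 10000 - *page, 0)
 *
 * so a random 4k read is charged *randio + *page while each additional
 * contiguous page only adds another *page.
 */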
801 
802 static void ioc_refresh_lcoefs(struct ioc *ioc)
803 {
804 	u64 *u = ioc->params.i_lcoefs;
805 	u64 *c = ioc->params.lcoefs;
806 
807 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
808 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
809 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
810 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
811 }
812 
813 static bool ioc_refresh_params(struct ioc *ioc, bool force)
814 {
815 	const struct ioc_params *p;
816 	int idx;
817 
818 	lockdep_assert_held(&ioc->lock);
819 
820 	idx = ioc_autop_idx(ioc);
821 	p = &autop[idx];
822 
823 	if (idx == ioc->autop_idx && !force)
824 		return false;
825 
826 	if (idx != ioc->autop_idx)
827 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
828 
829 	ioc->autop_idx = idx;
830 	ioc->autop_too_fast_at = 0;
831 	ioc->autop_too_slow_at = 0;
832 
833 	if (!ioc->user_qos_params)
834 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
835 	if (!ioc->user_cost_model)
836 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
837 
838 	ioc_refresh_period_us(ioc);
839 	ioc_refresh_lcoefs(ioc);
840 
841 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
842 					    VTIME_PER_USEC, MILLION);
843 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
844 				   VTIME_PER_USEC, MILLION);
845 
846 	return true;
847 }
848 
849 /* take a snapshot of the current [v]time and vrate */
850 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
851 {
852 	unsigned seq;
853 
854 	now->now_ns = ktime_get();
855 	now->now = ktime_to_us(now->now_ns);
856 	now->vrate = atomic64_read(&ioc->vtime_rate);
857 
858 	/*
859 	 * The current vtime is
860 	 *
861 	 *   vtime at period start + (wallclock time since the start) * vrate
862 	 *
863 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
864 	 * needed, they're seqcount protected.
865 	 */
866 	do {
867 		seq = read_seqcount_begin(&ioc->period_seqcount);
868 		now->vnow = ioc->period_at_vtime +
869 			(now->now - ioc->period_at) * now->vrate;
870 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
871 }
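
/*
 * For example (figures illustrative, not compiled), 300us into a period
 * with vrate at 75% of VTIME_PER_USEC:
 *
 *	vnow = period_at_vtime + 300 * (VTIME_PER_USEC * 3 / 4)
 *
 * so all active cgroups combined can only consume vtime at 75% of the
 * nominal device rate, which is how global slowdowns take effect.
 */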
872 
873 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
874 {
875 	lockdep_assert_held(&ioc->lock);
876 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
877 
878 	write_seqcount_begin(&ioc->period_seqcount);
879 	ioc->period_at = now->now;
880 	ioc->period_at_vtime = now->vnow;
881 	write_seqcount_end(&ioc->period_seqcount);
882 
883 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
884 	add_timer(&ioc->timer);
885 }
886 
887 /*
888  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
889  * weight sums and propagate upwards accordingly.
890  */
891 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
892 {
893 	struct ioc *ioc = iocg->ioc;
894 	int lvl;
895 
896 	lockdep_assert_held(&ioc->lock);
897 
898 	inuse = min(active, inuse);
899 
900 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
901 		struct ioc_gq *parent = iocg->ancestors[lvl];
902 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
903 		u32 parent_active = 0, parent_inuse = 0;
904 
905 		/* update the level sums */
906 		parent->child_active_sum += (s32)(active - child->active);
907 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
908 		/* apply the updates */
909 		child->active = active;
910 		child->inuse = inuse;
911 
912 		/*
913 		 * The delta between inuse and active sums indicates how
914 		 * much of the weight is being given away.  Parent's inuse
915 		 * and active should reflect the ratio.
916 		 */
917 		if (parent->child_active_sum) {
918 			parent_active = parent->weight;
919 			parent_inuse = DIV64_U64_ROUND_UP(
920 				parent_active * parent->child_inuse_sum,
921 				parent->child_active_sum);
922 		}
923 
924 		/* do we need to keep walking up? */
925 		if (parent_active == parent->active &&
926 		    parent_inuse == parent->inuse)
927 			break;
928 
929 		active = parent_active;
930 		inuse = parent_inuse;
931 	}
932 
933 	ioc->weights_updated = true;
934 }
935 
936 static void commit_active_weights(struct ioc *ioc)
937 {
938 	lockdep_assert_held(&ioc->lock);
939 
940 	if (ioc->weights_updated) {
941 		/* paired with rmb in current_hweight(), see there */
942 		smp_wmb();
943 		atomic_inc(&ioc->hweight_gen);
944 		ioc->weights_updated = false;
945 	}
946 }
947 
948 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
949 {
950 	__propagate_active_weight(iocg, active, inuse);
951 	commit_active_weights(iocg->ioc);
952 }
953 
954 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
955 {
956 	struct ioc *ioc = iocg->ioc;
957 	int lvl;
958 	u32 hwa, hwi;
959 	int ioc_gen;
960 
961 	/* hot path - if uptodate, use cached */
962 	ioc_gen = atomic_read(&ioc->hweight_gen);
963 	if (ioc_gen == iocg->hweight_gen)
964 		goto out;
965 
966 	/*
967 	 * Paired with wmb in commit_active_weights().  If we saw the
968 	 * updated hweight_gen, all the weight updates from
969 	 * __propagate_active_weight() are visible too.
970 	 *
971 	 * We can race with weight updates during calculation and get it
972 	 * wrong.  However, hweight_gen would have changed and a future
973 	 * reader will recalculate and we're guaranteed to discard the
974 	 * wrong result soon.
975 	 */
976 	smp_rmb();
977 
978 	hwa = hwi = HWEIGHT_WHOLE;
979 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
980 		struct ioc_gq *parent = iocg->ancestors[lvl];
981 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
982 		u32 active_sum = READ_ONCE(parent->child_active_sum);
983 		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
984 		u32 active = READ_ONCE(child->active);
985 		u32 inuse = READ_ONCE(child->inuse);
986 
987 		/* we can race with deactivations and either may read as zero */
988 		if (!active_sum || !inuse_sum)
989 			continue;
990 
991 		active_sum = max(active, active_sum);
992 		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
993 
994 		inuse_sum = max(inuse, inuse_sum);
995 		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
996 	}
997 
998 	iocg->hweight_active = max_t(u32, hwa, 1);
999 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1000 	iocg->hweight_gen = ioc_gen;
1001 out:
1002 	if (hw_activep)
1003 		*hw_activep = iocg->hweight_active;
1004 	if (hw_inusep)
1005 		*hw_inusep = iocg->hweight_inuse;
1006 }
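
/*
 * Walk-through (illustrative, not compiled) reusing the A0 example from
 * the top comment: with root's child_active_sum = 100 (A) + 300 (B) = 400
 * and A's child_active_sum = 100 (A0) + 100 (A1) = 200, A0's
 * hweight_active becomes
 *
 *	hwa = HWEIGHT_WHOLE * 100 / 400;	// root -> A,  25%
 *	hwa = hwa * 100 / 200;			// A -> A0,  12.5%
 *
 * i.e. 8192 out of 65536, matching the 12.5% share described in the
 * "Vtime Distribution" section.
 */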
1007 
1008 static void weight_updated(struct ioc_gq *iocg)
1009 {
1010 	struct ioc *ioc = iocg->ioc;
1011 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1012 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1013 	u32 weight;
1014 
1015 	lockdep_assert_held(&ioc->lock);
1016 
1017 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1018 	if (weight != iocg->weight && iocg->active)
1019 		propagate_active_weight(iocg, weight,
1020 			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1021 	iocg->weight = weight;
1022 }
1023 
1024 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1025 {
1026 	struct ioc *ioc = iocg->ioc;
1027 	u64 last_period, cur_period, max_period_delta;
1028 	u64 vtime, vmargin, vmin;
1029 	int i;
1030 
1031 	/*
1032 	 * If we seem to be already active, just update the stamp to tell the
1033 	 * timer that we're still active.  We don't mind occasional races.
1034 	 */
1035 	if (!list_empty(&iocg->active_list)) {
1036 		ioc_now(ioc, now);
1037 		cur_period = atomic64_read(&ioc->cur_period);
1038 		if (atomic64_read(&iocg->active_period) != cur_period)
1039 			atomic64_set(&iocg->active_period, cur_period);
1040 		return true;
1041 	}
1042 
1043 	/* racy check on internal node IOs, treat as root level IOs */
1044 	if (iocg->child_active_sum)
1045 		return false;
1046 
1047 	spin_lock_irq(&ioc->lock);
1048 
1049 	ioc_now(ioc, now);
1050 
1051 	/* update period */
1052 	cur_period = atomic64_read(&ioc->cur_period);
1053 	last_period = atomic64_read(&iocg->active_period);
1054 	atomic64_set(&iocg->active_period, cur_period);
1055 
1056 	/* already activated or breaking leaf-only constraint? */
1057 	if (!list_empty(&iocg->active_list))
1058 		goto succeed_unlock;
1059 	for (i = iocg->level - 1; i > 0; i--)
1060 		if (!list_empty(&iocg->ancestors[i]->active_list))
1061 			goto fail_unlock;
1062 
1063 	if (iocg->child_active_sum)
1064 		goto fail_unlock;
1065 
1066 	/*
1067 	 * vtime may wrap when vrate is raised substantially due to
1068 	 * underestimated IO costs.  Look at the period and ignore its
1069 	 * vtime if the iocg has been idle for too long.  Also, cap the
1070 	 * budget it can start with to the margin.
1071 	 */
1072 	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1073 	vtime = atomic64_read(&iocg->vtime);
1074 	vmargin = ioc->margin_us * now->vrate;
1075 	vmin = now->vnow - vmargin;
1076 
1077 	if (last_period + max_period_delta < cur_period ||
1078 	    time_before64(vtime, vmin)) {
1079 		atomic64_add(vmin - vtime, &iocg->vtime);
1080 		atomic64_add(vmin - vtime, &iocg->done_vtime);
1081 		vtime = vmin;
1082 	}
1083 
1084 	/*
1085 	 * Activate, propagate weight and start period timer if not
1086 	 * running.  Reset hweight_gen to avoid accidental match from
1087 	 * wrapping.
1088 	 */
1089 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1090 	list_add(&iocg->active_list, &ioc->active_iocgs);
1091 	propagate_active_weight(iocg, iocg->weight,
1092 				iocg->last_inuse ?: iocg->weight);
1093 
1094 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1095 			last_period, cur_period, vtime);
1096 
1097 	iocg->last_vtime = vtime;
1098 
1099 	if (ioc->running == IOC_IDLE) {
1100 		ioc->running = IOC_RUNNING;
1101 		ioc_start_period(ioc, now);
1102 	}
1103 
1104 succeed_unlock:
1105 	spin_unlock_irq(&ioc->lock);
1106 	return true;
1107 
1108 fail_unlock:
1109 	spin_unlock_irq(&ioc->lock);
1110 	return false;
1111 }
1112 
1113 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1114 			int flags, void *key)
1115 {
1116 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1117 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1118 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1119 
1120 	ctx->vbudget -= cost;
1121 
1122 	if (ctx->vbudget < 0)
1123 		return -1;
1124 
1125 	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1126 
1127 	/*
1128 	 * autoremove_wake_function() removes the wait entry only when it
1129 	 * actually changed the task state.  We want the wait always
1130 	 * removed.  Remove explicitly and use default_wake_function().
1131 	 */
1132 	list_del_init(&wq_entry->entry);
1133 	wait->committed = true;
1134 
1135 	default_wake_function(wq_entry, mode, flags, key);
1136 	return 0;
1137 }
1138 
1139 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1140 {
1141 	struct ioc *ioc = iocg->ioc;
1142 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1143 	u64 margin_ns = (u64)(ioc->period_us *
1144 			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1145 	u64 vdebt, vshortage, expires, oexpires;
1146 	s64 vbudget;
1147 	u32 hw_inuse;
1148 
1149 	lockdep_assert_held(&iocg->waitq.lock);
1150 
1151 	current_hweight(iocg, NULL, &hw_inuse);
1152 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1153 
1154 	/* pay off debt */
1155 	vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1156 	if (vdebt && vbudget > 0) {
1157 		u64 delta = min_t(u64, vbudget, vdebt);
1158 		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1159 				    iocg->abs_vdebt);
1160 
1161 		atomic64_add(delta, &iocg->vtime);
1162 		atomic64_add(delta, &iocg->done_vtime);
1163 		iocg->abs_vdebt -= abs_delta;
1164 	}
1165 
1166 	/*
1167 	 * Wake up the ones which are due and see how much vtime we'll need
1168 	 * for the next one.
1169 	 */
1170 	ctx.hw_inuse = hw_inuse;
1171 	ctx.vbudget = vbudget - vdebt;
1172 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1173 	if (!waitqueue_active(&iocg->waitq))
1174 		return;
1175 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1176 		return;
1177 
1178 	/* determine next wakeup, add a quarter margin to guarantee chunking */
1179 	vshortage = -ctx.vbudget;
1180 	expires = now->now_ns +
1181 		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1182 	expires += margin_ns / 4;
1183 
1184 	/* if already active and close enough, don't bother */
1185 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1186 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1187 	    abs(oexpires - expires) <= margin_ns / 4)
1188 		return;
1189 
1190 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1191 			       margin_ns / 4, HRTIMER_MODE_ABS);
1192 }
1193 
1194 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1195 {
1196 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1197 	struct ioc_now now;
1198 	unsigned long flags;
1199 
1200 	ioc_now(iocg->ioc, &now);
1201 
1202 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1203 	iocg_kick_waitq(iocg, &now);
1204 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1205 
1206 	return HRTIMER_NORESTART;
1207 }
1208 
1209 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1210 {
1211 	struct ioc *ioc = iocg->ioc;
1212 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1213 	u64 vtime = atomic64_read(&iocg->vtime);
1214 	u64 vmargin = ioc->margin_us * now->vrate;
1215 	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1216 	u64 expires, oexpires;
1217 	u32 hw_inuse;
1218 
1219 	lockdep_assert_held(&iocg->waitq.lock);
1220 
1221 	/* debt-adjust vtime */
1222 	current_hweight(iocg, NULL, &hw_inuse);
1223 	vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1224 
1225 	/*
1226 	 * Clear or maintain depending on the overage. Non-zero vdebt is what
1227 	 * guarantees that @iocg is online and future iocg_kick_delay() will
1228 	 * clear use_delay. Don't leave it on when there's no vdebt.
1229 	 */
1230 	if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1231 		blkcg_clear_delay(blkg);
1232 		return false;
1233 	}
1234 	if (!atomic_read(&blkg->use_delay) &&
1235 	    time_before_eq64(vtime, now->vnow + vmargin))
1236 		return false;
1237 
1238 	/* use delay */
1239 	if (cost) {
1240 		u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1241 						 now->vrate);
1242 		blkcg_add_delay(blkg, now->now_ns, cost_ns);
1243 	}
1244 	blkcg_use_delay(blkg);
1245 
1246 	expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1247 						   now->vrate) * NSEC_PER_USEC;
1248 
1249 	/* if already active and close enough, don't bother */
1250 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1251 	if (hrtimer_is_queued(&iocg->delay_timer) &&
1252 	    abs(oexpires - expires) <= margin_ns / 4)
1253 		return true;
1254 
1255 	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1256 			       margin_ns / 4, HRTIMER_MODE_ABS);
1257 	return true;
1258 }
1259 
1260 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1261 {
1262 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1263 	struct ioc_now now;
1264 	unsigned long flags;
1265 
1266 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1267 	ioc_now(iocg->ioc, &now);
1268 	iocg_kick_delay(iocg, &now, 0);
1269 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1270 
1271 	return HRTIMER_NORESTART;
1272 }
1273 
1274 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1275 {
1276 	u32 nr_met[2] = { };
1277 	u32 nr_missed[2] = { };
1278 	u64 rq_wait_ns = 0;
1279 	int cpu, rw;
1280 
1281 	for_each_online_cpu(cpu) {
1282 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1283 		u64 this_rq_wait_ns;
1284 
1285 		for (rw = READ; rw <= WRITE; rw++) {
1286 			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1287 			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1288 
1289 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1290 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1291 			stat->missed[rw].last_met = this_met;
1292 			stat->missed[rw].last_missed = this_missed;
1293 		}
1294 
1295 		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1296 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1297 		stat->last_rq_wait_ns = this_rq_wait_ns;
1298 	}
1299 
1300 	for (rw = READ; rw <= WRITE; rw++) {
1301 		if (nr_met[rw] + nr_missed[rw])
1302 			missed_ppm_ar[rw] =
1303 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1304 						   nr_met[rw] + nr_missed[rw]);
1305 		else
1306 			missed_ppm_ar[rw] = 0;
1307 	}
1308 
1309 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1310 				   ioc->period_us * NSEC_PER_USEC);
1311 }
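
/*
 * Example figures (illustrative, not compiled): if 95 IOs met the latency
 * target and 5 missed it during the period,
 *
 *	missed_ppm = 5 * MILLION / (95 + 5) = 50000	(5%)
 *
 * and 30ms of accumulated rq wait over a 300ms period gives
 * rq_wait_pct = 10, above RQ_WAIT_BUSY_PCT and thus a busy signal.
 */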
1312 
1313 /* was iocg idle this period? */
1314 static bool iocg_is_idle(struct ioc_gq *iocg)
1315 {
1316 	struct ioc *ioc = iocg->ioc;
1317 
1318 	/* did something get issued this period? */
1319 	if (atomic64_read(&iocg->active_period) ==
1320 	    atomic64_read(&ioc->cur_period))
1321 		return false;
1322 
1323 	/* is something in flight? */
1324 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1325 		return false;
1326 
1327 	return true;
1328 }
1329 
1330 /* returns usage with margin added if surplus is large enough */
1331 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1332 {
1333 	/* add margin */
1334 	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1335 	usage += SURPLUS_SCALE_ABS;
1336 
1337 	/* don't bother if the surplus is too small */
1338 	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1339 		return 0;
1340 
1341 	return usage;
1342 }
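
/*
 * Rough example (illustrative, not compiled): an iocg whose usage is ~10%
 * of HWEIGHT_WHOLE while holding hw_inuse of 50% gets
 *
 *	usage = 10% * 125 / 100 + 2% = 14.5%
 *
 * and since 14.5% + 3% < 50%, the headroom above the scaled usage is
 * considered donatable surplus; had hw_inuse been only 16%, the function
 * would return 0 and nothing would be given away.
 */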
1343 
1344 static void ioc_timer_fn(struct timer_list *timer)
1345 {
1346 	struct ioc *ioc = container_of(timer, struct ioc, timer);
1347 	struct ioc_gq *iocg, *tiocg;
1348 	struct ioc_now now;
1349 	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1350 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1351 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1352 	u32 missed_ppm[2], rq_wait_pct;
1353 	u64 period_vtime;
1354 	int prev_busy_level, i;
1355 
1356 	/* how were the latencies during the period? */
1357 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1358 
1359 	/* take care of active iocgs */
1360 	spin_lock_irq(&ioc->lock);
1361 
1362 	ioc_now(ioc, &now);
1363 
1364 	period_vtime = now.vnow - ioc->period_at_vtime;
1365 	if (WARN_ON_ONCE(!period_vtime)) {
1366 		spin_unlock_irq(&ioc->lock);
1367 		return;
1368 	}
1369 
1370 	/*
1371 	 * Waiters determine the sleep durations based on the vrate they
1372 	 * saw at the time of sleep.  If vrate has increased, some waiters
1373 	 * could be sleeping for too long.  Wake up tardy waiters which
1374 	 * should have woken up in the last period and expire idle iocgs.
1375 	 */
1376 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1377 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1378 		    !iocg_is_idle(iocg))
1379 			continue;
1380 
1381 		spin_lock(&iocg->waitq.lock);
1382 
1383 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1384 			/* might be oversleeping vtime / hweight changes, kick */
1385 			iocg_kick_waitq(iocg, &now);
1386 			iocg_kick_delay(iocg, &now, 0);
1387 		} else if (iocg_is_idle(iocg)) {
1388 			/* no waiter and idle, deactivate */
1389 			iocg->last_inuse = iocg->inuse;
1390 			__propagate_active_weight(iocg, 0, 0);
1391 			list_del_init(&iocg->active_list);
1392 		}
1393 
1394 		spin_unlock(&iocg->waitq.lock);
1395 	}
1396 	commit_active_weights(ioc);
1397 
1398 	/* calc usages and see whether some weights need to be moved around */
1399 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1400 		u64 vdone, vtime, vusage, vmargin, vmin;
1401 		u32 hw_active, hw_inuse, usage;
1402 
1403 		/*
1404 		 * Collect unused and wind vtime closer to vnow to prevent
1405 		 * iocgs from accumulating a large amount of budget.
1406 		 */
1407 		vdone = atomic64_read(&iocg->done_vtime);
1408 		vtime = atomic64_read(&iocg->vtime);
1409 		current_hweight(iocg, &hw_active, &hw_inuse);
1410 
1411 		/*
1412 		 * Latency QoS detection doesn't account for IOs which are
1413 		 * in-flight for longer than a period.  Detect them by
1414 		 * comparing vdone against period start.  If lagging behind
1415 		 * IOs from past periods, don't increase vrate.
1416 		 */
1417 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1418 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1419 		    time_after64(vtime, vdone) &&
1420 		    time_after64(vtime, now.vnow -
1421 				 MAX_LAGGING_PERIODS * period_vtime) &&
1422 		    time_before64(vdone, now.vnow - period_vtime))
1423 			nr_lagging++;
1424 
1425 		if (waitqueue_active(&iocg->waitq))
1426 			vusage = now.vnow - iocg->last_vtime;
1427 		else if (time_before64(iocg->last_vtime, vtime))
1428 			vusage = vtime - iocg->last_vtime;
1429 		else
1430 			vusage = 0;
1431 
1432 		iocg->last_vtime += vusage;
1433 		/*
1434 		 * Factor in in-flight vtime into vusage to avoid
1435 		 * high-latency completions appearing as idle.  This should
1436 		 * be done after the above ->last_vtime adjustment.
1437 		 */
1438 		vusage = max(vusage, vtime - vdone);
1439 
1440 		/* calculate hweight based usage ratio and record */
1441 		if (vusage) {
1442 			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1443 						   period_vtime);
1444 			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1445 			iocg->usages[iocg->usage_idx] = usage;
1446 		} else {
1447 			usage = 0;
1448 		}
1449 
1450 		/* see whether there's surplus vtime */
1451 		vmargin = ioc->margin_us * now.vrate;
1452 		vmin = now.vnow - vmargin;
1453 
1454 		iocg->has_surplus = false;
1455 
1456 		if (!waitqueue_active(&iocg->waitq) &&
1457 		    time_before64(vtime, vmin)) {
1458 			u64 delta = vmin - vtime;
1459 
1460 			/* throw away surplus vtime */
1461 			atomic64_add(delta, &iocg->vtime);
1462 			atomic64_add(delta, &iocg->done_vtime);
1463 			iocg->last_vtime += delta;
1464 			/* if usage is sufficiently low, maybe it can donate */
1465 			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1466 				iocg->has_surplus = true;
1467 				nr_surpluses++;
1468 			}
1469 		} else if (hw_inuse < hw_active) {
1470 			u32 new_hwi, new_inuse;
1471 
1472 			/* was donating but might need to take back some */
1473 			if (waitqueue_active(&iocg->waitq)) {
1474 				new_hwi = hw_active;
1475 			} else {
1476 				new_hwi = max(hw_inuse,
1477 					      usage * SURPLUS_SCALE_PCT / 100 +
1478 					      SURPLUS_SCALE_ABS);
1479 			}
1480 
1481 			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1482 					      hw_inuse);
1483 			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1484 
1485 			if (new_inuse > iocg->inuse) {
1486 				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1487 						iocg->inuse, new_inuse,
1488 						hw_inuse, new_hwi);
1489 				__propagate_active_weight(iocg, iocg->weight,
1490 							  new_inuse);
1491 			}
1492 		} else {
1493 			/* genuinely out of vtime */
1494 			nr_shortages++;
1495 		}
1496 	}
1497 
1498 	if (!nr_shortages || !nr_surpluses)
1499 		goto skip_surplus_transfers;
1500 
1501 	/* there are both shortages and surpluses, transfer surpluses */
1502 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1503 		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1504 		int nr_valid = 0;
1505 
1506 		if (!iocg->has_surplus)
1507 			continue;
1508 
1509 		/* base the decision on max historical usage */
1510 		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1511 			if (iocg->usages[i]) {
1512 				usage = max(usage, iocg->usages[i]);
1513 				nr_valid++;
1514 			}
1515 		}
1516 		if (nr_valid < MIN_VALID_USAGES)
1517 			continue;
1518 
1519 		current_hweight(iocg, &hw_active, &hw_inuse);
1520 		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1521 		if (!new_hwi)
1522 			continue;
1523 
1524 		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1525 					       hw_inuse);
1526 		if (new_inuse < iocg->inuse) {
1527 			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1528 					iocg->inuse, new_inuse,
1529 					hw_inuse, new_hwi);
1530 			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1531 		}
1532 	}
1533 skip_surplus_transfers:
1534 	commit_active_weights(ioc);
1535 
1536 	/*
1537 	 * If q is getting clogged or we're missing too much, we're issuing
1538 	 * too much IO and should lower vtime rate.  If we're not missing
1539 	 * and experiencing shortages but not surpluses, we're too stingy
1540 	 * and should increase vtime rate.
1541 	 */
1542 	prev_busy_level = ioc->busy_level;
1543 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1544 	    missed_ppm[READ] > ppm_rthr ||
1545 	    missed_ppm[WRITE] > ppm_wthr) {
1546 		ioc->busy_level = max(ioc->busy_level, 0);
1547 		ioc->busy_level++;
1548 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1549 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1550 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1551 		/* take action iff there is contention */
1552 		if (nr_shortages && !nr_lagging) {
1553 			ioc->busy_level = min(ioc->busy_level, 0);
1554 			/* redistribute surpluses first */
1555 			if (!nr_surpluses)
1556 				ioc->busy_level--;
1557 		}
1558 	} else {
1559 		ioc->busy_level = 0;
1560 	}
1561 
1562 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1563 
1564 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1565 		u64 vrate = atomic64_read(&ioc->vtime_rate);
1566 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1567 
1568 		/* rq_wait signal is always reliable, ignore user vrate_min */
1569 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1570 			vrate_min = VRATE_MIN;
1571 
1572 		/*
1573 		 * If vrate is out of bounds, apply clamp gradually as the
1574 		 * bounds can change abruptly.  Otherwise, apply busy_level
1575 		 * based adjustment.
1576 		 */
1577 		if (vrate < vrate_min) {
1578 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1579 					  100);
1580 			vrate = min(vrate, vrate_min);
1581 		} else if (vrate > vrate_max) {
1582 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1583 					  100);
1584 			vrate = max(vrate, vrate_max);
1585 		} else {
1586 			int idx = min_t(int, abs(ioc->busy_level),
1587 					ARRAY_SIZE(vrate_adj_pct) - 1);
1588 			u32 adj_pct = vrate_adj_pct[idx];
1589 
1590 			if (ioc->busy_level > 0)
1591 				adj_pct = 100 - adj_pct;
1592 			else
1593 				adj_pct = 100 + adj_pct;
1594 
1595 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1596 				      vrate_min, vrate_max);
1597 		}
1598 
1599 		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1600 					   nr_lagging, nr_shortages,
1601 					   nr_surpluses);
1602 
1603 		atomic64_set(&ioc->vtime_rate, vrate);
1604 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1605 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1606 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1607 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1608 					   missed_ppm, rq_wait_pct, nr_lagging,
1609 					   nr_shortages, nr_surpluses);
1610 	}
1611 
1612 	ioc_refresh_params(ioc, false);
1613 
1614 	/*
1615 	 * This period is done.  Move onto the next one.  If nothing's
1616 	 * going on with the device, stop the timer.
1617 	 */
1618 	atomic64_inc(&ioc->cur_period);
1619 
1620 	if (ioc->running != IOC_STOP) {
1621 		if (!list_empty(&ioc->active_iocgs)) {
1622 			ioc_start_period(ioc, &now);
1623 		} else {
1624 			ioc->busy_level = 0;
1625 			ioc->running = IOC_IDLE;
1626 		}
1627 	}
1628 
1629 	spin_unlock_irq(&ioc->lock);
1630 }
1631 
1632 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1633 				    bool is_merge, u64 *costp)
1634 {
1635 	struct ioc *ioc = iocg->ioc;
1636 	u64 coef_seqio, coef_randio, coef_page;
1637 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1638 	u64 seek_pages = 0;
1639 	u64 cost = 0;
1640 
1641 	switch (bio_op(bio)) {
1642 	case REQ_OP_READ:
1643 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1644 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1645 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1646 		break;
1647 	case REQ_OP_WRITE:
1648 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1649 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1650 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1651 		break;
1652 	default:
1653 		goto out;
1654 	}
1655 
1656 	if (iocg->cursor) {
1657 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1658 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1659 	}
1660 
1661 	if (!is_merge) {
1662 		if (seek_pages > LCOEF_RANDIO_PAGES) {
1663 			cost += coef_randio;
1664 		} else {
1665 			cost += coef_seqio;
1666 		}
1667 	}
1668 	cost += pages * coef_page;
1669 out:
1670 	*costp = cost;
1671 }
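
/*
 * Illustrative costing (not compiled): a 64k read starting right at the
 * previous IO's end sector (seek_pages <= LCOEF_RANDIO_PAGES) is charged
 *
 *	cost = coef_seqio + 16 * coef_page;
 *
 * while the same read issued 1G away from the cursor pays the random base
 * cost instead:
 *
 *	cost = coef_randio + 16 * coef_page;
 *
 * Merges (is_merge) skip the per-IO base cost and only pay the
 * size-proportional component.
 */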
1672 
1673 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1674 {
1675 	u64 cost;
1676 
1677 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1678 	return cost;
1679 }
1680 
1681 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1682 {
1683 	struct blkcg_gq *blkg = bio->bi_blkg;
1684 	struct ioc *ioc = rqos_to_ioc(rqos);
1685 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1686 	struct ioc_now now;
1687 	struct iocg_wait wait;
1688 	u32 hw_active, hw_inuse;
1689 	u64 abs_cost, cost, vtime;
1690 
1691 	/* bypass IOs if disabled or for root cgroup */
1692 	if (!ioc->enabled || !iocg->level)
1693 		return;
1694 
1695 	/* always activate so that even 0 cost IOs get protected to some level */
1696 	if (!iocg_activate(iocg, &now))
1697 		return;
1698 
1699 	/* calculate the absolute vtime cost */
1700 	abs_cost = calc_vtime_cost(bio, iocg, false);
1701 	if (!abs_cost)
1702 		return;
1703 
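	/* remember where this IO ends for the next IO's seek distance */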
1704 	iocg->cursor = bio_end_sector(bio);
1705 
1706 	vtime = atomic64_read(&iocg->vtime);
1707 	current_hweight(iocg, &hw_active, &hw_inuse);
1708 
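	/*
	 * If inuse has been lowered below the active weight but the iocg is
	 * now consuming its budget again, restore inuse to the full weight.
	 */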
1709 	if (hw_inuse < hw_active &&
1710 	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1711 		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1712 				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1713 		spin_lock_irq(&ioc->lock);
1714 		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1715 		spin_unlock_irq(&ioc->lock);
1716 		current_hweight(iocg, &hw_active, &hw_inuse);
1717 	}
1718 
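	/* scale the absolute cost to this iocg's local vtime by its in-use hweight */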
1719 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1720 
1721 	/*
1722 	 * If no one's waiting and within budget, issue right away.  The
1723 	 * tests are racy but the races aren't systemic - we only miss once
1724 	 * in a while, which is fine.
1725 	 */
1726 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1727 	    time_before_eq64(vtime + cost, now.vnow)) {
1728 		iocg_commit_bio(iocg, bio, cost);
1729 		return;
1730 	}
1731 
1732 	/*
1733 	 * We activated above but w/o any synchronization. Deactivation is
1734 	 * synchronized with waitq.lock and we won't get deactivated as long
1735 	 * as we're waiting or have debt, so we're good if we're activated
1736 	 * here. In the unlikely case that we aren't, just issue the IO.
1737 	 */
1738 	spin_lock_irq(&iocg->waitq.lock);
1739 
1740 	if (unlikely(list_empty(&iocg->active_list))) {
1741 		spin_unlock_irq(&iocg->waitq.lock);
1742 		iocg_commit_bio(iocg, bio, cost);
1743 		return;
1744 	}
1745 
1746 	/*
1747 	 * We're over budget. If @bio has to be issued regardless, remember
1748 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1749 	 * off the debt before waking more IOs.
1750 	 *
1751 	 * This way, the debt is continuously paid off each period with the
1752 	 * actual budget available to the cgroup. If we just wound vtime, we
1753 	 * would incorrectly use the current hw_inuse for the entire amount
1754 	 * which, for example, can lead to the cgroup staying blocked for a
1755 	 * long time even with substantially raised hw_inuse.
1756 	 *
1757 	 * An iocg with vdebt should stay online so that the timer can keep
1758 	 * deducting its vdebt and [de]activate the use_delay mechanism
1759 	 * accordingly. We don't want to race against the timer trying to
1760 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1761 	 * penalizing the cgroup and its descendants.
1762 	 */
1763 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1764 		iocg->abs_vdebt += abs_cost;
1765 		if (iocg_kick_delay(iocg, &now, cost))
1766 			blkcg_schedule_throttle(rqos->q,
1767 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1768 		spin_unlock_irq(&iocg->waitq.lock);
1769 		return;
1770 	}
1771 
1772 	/*
1773 	 * Append self to the waitq and schedule the wakeup timer if we're
1774 	 * the first waiter.  The timer duration is calculated based on the
1775 	 * current vrate.  vtime and hweight changes can make it too short
1776 	 * or too long.  Each wait entry records the absolute cost it's
1777 	 * waiting for to allow re-evaluation using a custom wait entry.
1778 	 *
1779 	 * If too short, the timer simply reschedules itself.  If too long,
1780 	 * the period timer will notice and trigger wakeups.
1781 	 *
1782 	 * All waiters are on iocg->waitq and the wait states are
1783 	 * synchronized using waitq.lock.
1784 	 */
1785 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1786 	wait.wait.private = current;
1787 	wait.bio = bio;
1788 	wait.abs_cost = abs_cost;
1789 	wait.committed = false;	/* will be set true by waker */
1790 
1791 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1792 	iocg_kick_waitq(iocg, &now);
1793 
1794 	spin_unlock_irq(&iocg->waitq.lock);
1795 
1796 	while (true) {
1797 		set_current_state(TASK_UNINTERRUPTIBLE);
1798 		if (wait.committed)
1799 			break;
1800 		io_schedule();
1801 	}
1802 
1803 	/* waker already committed us, proceed */
1804 	finish_wait(&iocg->waitq, &wait.wait);
1805 }
1806 
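/*
 * rq_qos merge hook.  A merged bio still consumes device time, so charge its
 * cost here; when over budget, record it as debt rather than sleeping (see
 * the debt handling in ioc_rqos_throttle()).
 */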
1807 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1808 			   struct bio *bio)
1809 {
1810 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1811 	struct ioc *ioc = iocg->ioc;
1812 	sector_t bio_end = bio_end_sector(bio);
1813 	struct ioc_now now;
1814 	u32 hw_inuse;
1815 	u64 abs_cost, cost;
1816 	unsigned long flags;
1817 
1818 	/* bypass if disabled or for root cgroup */
1819 	if (!ioc->enabled || !iocg->level)
1820 		return;
1821 
1822 	abs_cost = calc_vtime_cost(bio, iocg, true);
1823 	if (!abs_cost)
1824 		return;
1825 
1826 	ioc_now(ioc, &now);
1827 	current_hweight(iocg, NULL, &hw_inuse);
1828 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1829 
1830 	/* update cursor if backmerging into the request at the cursor */
1831 	if (blk_rq_pos(rq) < bio_end &&
1832 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1833 		iocg->cursor = bio_end;
1834 
1835 	/*
1836 	 * Charge if there's enough vtime budget and the existing request has
1837 	 * cost assigned.
1838 	 */
1839 	if (rq->bio && rq->bio->bi_iocost_cost &&
1840 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1841 		iocg_commit_bio(iocg, bio, cost);
1842 		return;
1843 	}
1844 
1845 	/*
1846 	 * Otherwise, account it as debt if @iocg is online, which it should
1847 	 * be for the vast majority of cases. See debt handling in
1848 	 * ioc_rqos_throttle() for details.
1849 	 */
1850 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1851 	if (likely(!list_empty(&iocg->active_list))) {
1852 		iocg->abs_vdebt += abs_cost;
1853 		iocg_kick_delay(iocg, &now, cost);
1854 	} else {
1855 		iocg_commit_bio(iocg, bio, cost);
1856 	}
1857 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1858 }
1859 
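/* per-bio completion hook - transfer the charged cost to done_vtime */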
1860 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1861 {
1862 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1863 
1864 	if (iocg && bio->bi_iocost_cost)
1865 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1866 }
1867 
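/*
 * Request completion hook.  Record whether the request completed within its
 * QoS latency target and accumulate the time it spent waiting for a free
 * request; the period timer folds these into missed_ppm and rq_wait_pct.
 */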
1868 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1869 {
1870 	struct ioc *ioc = rqos_to_ioc(rqos);
1871 	u64 on_q_ns, rq_wait_ns;
1872 	int pidx, rw;
1873 
1874 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1875 		return;
1876 
1877 	switch (req_op(rq) & REQ_OP_MASK) {
1878 	case REQ_OP_READ:
1879 		pidx = QOS_RLAT;
1880 		rw = READ;
1881 		break;
1882 	case REQ_OP_WRITE:
1883 		pidx = QOS_WLAT;
1884 		rw = WRITE;
1885 		break;
1886 	default:
1887 		return;
1888 	}
1889 
1890 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1891 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1892 
1893 	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1894 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1895 	else
1896 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1897 
1898 	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1899 }
1900 
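/* auto parameter selection can depend on the queue depth - refresh */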
1901 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1902 {
1903 	struct ioc *ioc = rqos_to_ioc(rqos);
1904 
1905 	spin_lock_irq(&ioc->lock);
1906 	ioc_refresh_params(ioc, false);
1907 	spin_unlock_irq(&ioc->lock);
1908 }
1909 
1910 static void ioc_rqos_exit(struct rq_qos *rqos)
1911 {
1912 	struct ioc *ioc = rqos_to_ioc(rqos);
1913 
1914 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1915 
1916 	spin_lock_irq(&ioc->lock);
1917 	ioc->running = IOC_STOP;
1918 	spin_unlock_irq(&ioc->lock);
1919 
1920 	del_timer_sync(&ioc->timer);
1921 	free_percpu(ioc->pcpu_stat);
1922 	kfree(ioc);
1923 }
1924 
1925 static struct rq_qos_ops ioc_rqos_ops = {
1926 	.throttle = ioc_rqos_throttle,
1927 	.merge = ioc_rqos_merge,
1928 	.done_bio = ioc_rqos_done_bio,
1929 	.done = ioc_rqos_done,
1930 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1931 	.exit = ioc_rqos_exit,
1932 };
1933 
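/*
 * Set up iocost for @q: allocate the ioc, register it as an rq_qos policy
 * and activate the blkcg policy.  Called lazily from the first write to
 * io.cost.qos or io.cost.model for the device.
 */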
1934 static int blk_iocost_init(struct request_queue *q)
1935 {
1936 	struct ioc *ioc;
1937 	struct rq_qos *rqos;
1938 	int ret;
1939 
1940 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1941 	if (!ioc)
1942 		return -ENOMEM;
1943 
1944 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1945 	if (!ioc->pcpu_stat) {
1946 		kfree(ioc);
1947 		return -ENOMEM;
1948 	}
1949 
1950 	rqos = &ioc->rqos;
1951 	rqos->id = RQ_QOS_COST;
1952 	rqos->ops = &ioc_rqos_ops;
1953 	rqos->q = q;
1954 
1955 	spin_lock_init(&ioc->lock);
1956 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1957 	INIT_LIST_HEAD(&ioc->active_iocgs);
1958 
1959 	ioc->running = IOC_IDLE;
1960 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1961 	seqcount_init(&ioc->period_seqcount);
1962 	ioc->period_at = ktime_to_us(ktime_get());
1963 	atomic64_set(&ioc->cur_period, 0);
1964 	atomic_set(&ioc->hweight_gen, 0);
1965 
1966 	spin_lock_irq(&ioc->lock);
1967 	ioc->autop_idx = AUTOP_INVALID;
1968 	ioc_refresh_params(ioc, true);
1969 	spin_unlock_irq(&ioc->lock);
1970 
1971 	rq_qos_add(q, rqos);
1972 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1973 	if (ret) {
1974 		rq_qos_del(q, rqos);
1975 		free_percpu(ioc->pcpu_stat);
1976 		kfree(ioc);
1977 		return ret;
1978 	}
1979 	return 0;
1980 }
1981 
1982 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1983 {
1984 	struct ioc_cgrp *iocc;
1985 
1986 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
1987 	if (!iocc)
1988 		return NULL;
1989 
1990 	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1991 	return &iocc->cpd;
1992 }
1993 
1994 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1995 {
1996 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
1997 }
1998 
1999 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2000 					     struct blkcg *blkcg)
2001 {
2002 	int levels = blkcg->css.cgroup->level + 1;
2003 	struct ioc_gq *iocg;
2004 
2005 	iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
2006 			    gfp, q->node);
2007 	if (!iocg)
2008 		return NULL;
2009 
2010 	return &iocg->pd;
2011 }
2012 
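/*
 * Initialize a per-(cgroup, device) iocg: start its vtime clocks at the
 * current device vtime, set up the wait and delay timers, and record the
 * ancestor table used for hierarchical weight calculations.
 */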
2013 static void ioc_pd_init(struct blkg_policy_data *pd)
2014 {
2015 	struct ioc_gq *iocg = pd_to_iocg(pd);
2016 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2017 	struct ioc *ioc = q_to_ioc(blkg->q);
2018 	struct ioc_now now;
2019 	struct blkcg_gq *tblkg;
2020 	unsigned long flags;
2021 
2022 	ioc_now(ioc, &now);
2023 
2024 	iocg->ioc = ioc;
2025 	atomic64_set(&iocg->vtime, now.vnow);
2026 	atomic64_set(&iocg->done_vtime, now.vnow);
2027 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2028 	INIT_LIST_HEAD(&iocg->active_list);
2029 	iocg->hweight_active = HWEIGHT_WHOLE;
2030 	iocg->hweight_inuse = HWEIGHT_WHOLE;
2031 
2032 	init_waitqueue_head(&iocg->waitq);
2033 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2034 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2035 	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2036 	iocg->delay_timer.function = iocg_delay_timer_fn;
2037 
2038 	iocg->level = blkg->blkcg->css.cgroup->level;
2039 
2040 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2041 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2042 		iocg->ancestors[tiocg->level] = tiocg;
2043 	}
2044 
2045 	spin_lock_irqsave(&ioc->lock, flags);
2046 	weight_updated(iocg);
2047 	spin_unlock_irqrestore(&ioc->lock, flags);
2048 }
2049 
2050 static void ioc_pd_free(struct blkg_policy_data *pd)
2051 {
2052 	struct ioc_gq *iocg = pd_to_iocg(pd);
2053 	struct ioc *ioc = iocg->ioc;
2054 
2055 	if (ioc) {
2056 		spin_lock(&ioc->lock);
2057 		if (!list_empty(&iocg->active_list)) {
2058 			propagate_active_weight(iocg, 0, 0);
2059 			list_del_init(&iocg->active_list);
2060 		}
2061 		spin_unlock(&ioc->lock);
2062 
2063 		hrtimer_cancel(&iocg->waitq_timer);
2064 		hrtimer_cancel(&iocg->delay_timer);
2065 	}
2066 	kfree(iocg);
2067 }
2068 
2069 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2070 			     int off)
2071 {
2072 	const char *dname = blkg_dev_name(pd->blkg);
2073 	struct ioc_gq *iocg = pd_to_iocg(pd);
2074 
2075 	if (dname && iocg->cfg_weight)
2076 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2077 	return 0;
2078 }
2079 
2080 
2081 static int ioc_weight_show(struct seq_file *sf, void *v)
2082 {
2083 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2084 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2085 
2086 	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2087 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2088 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2089 	return 0;
2090 }
2091 
2092 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
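/*
 * io.weight write handler.  "default WEIGHT" or a bare "WEIGHT" updates the
 * cgroup-wide default; "MAJ:MIN WEIGHT" sets a per-device weight and
 * "MAJ:MIN default" clears it.
 */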
2093 				size_t nbytes, loff_t off)
2094 {
2095 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2096 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2097 	struct blkg_conf_ctx ctx;
2098 	struct ioc_gq *iocg;
2099 	u32 v;
2100 	int ret;
2101 
2102 	if (!strchr(buf, ':')) {
2103 		struct blkcg_gq *blkg;
2104 
2105 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2106 			return -EINVAL;
2107 
2108 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2109 			return -EINVAL;
2110 
2111 		spin_lock(&blkcg->lock);
2112 		iocc->dfl_weight = v;
2113 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2114 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2115 
2116 			if (iocg) {
2117 				spin_lock_irq(&iocg->ioc->lock);
2118 				weight_updated(iocg);
2119 				spin_unlock_irq(&iocg->ioc->lock);
2120 			}
2121 		}
2122 		spin_unlock(&blkcg->lock);
2123 
2124 		return nbytes;
2125 	}
2126 
2127 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2128 	if (ret)
2129 		return ret;
2130 
2131 	iocg = blkg_to_iocg(ctx.blkg);
2132 
2133 	if (!strncmp(ctx.body, "default", 7)) {
2134 		v = 0;
2135 	} else {
2136 		if (!sscanf(ctx.body, "%u", &v))
2137 			goto einval;
2138 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2139 			goto einval;
2140 	}
2141 
2142 	spin_lock(&iocg->ioc->lock);
2143 	iocg->cfg_weight = v;
2144 	weight_updated(iocg);
2145 	spin_unlock(&iocg->ioc->lock);
2146 
2147 	blkg_conf_finish(&ctx);
2148 	return nbytes;
2149 
2150 einval:
2151 	blkg_conf_finish(&ctx);
2152 	return -EINVAL;
2153 }
2154 
2155 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2156 			  int off)
2157 {
2158 	const char *dname = blkg_dev_name(pd->blkg);
2159 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2160 
2161 	if (!dname)
2162 		return 0;
2163 
2164 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2165 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2166 		   ioc->params.qos[QOS_RPPM] / 10000,
2167 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2168 		   ioc->params.qos[QOS_RLAT],
2169 		   ioc->params.qos[QOS_WPPM] / 10000,
2170 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2171 		   ioc->params.qos[QOS_WLAT],
2172 		   ioc->params.qos[QOS_MIN] / 10000,
2173 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2174 		   ioc->params.qos[QOS_MAX] / 10000,
2175 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2176 	return 0;
2177 }
2178 
2179 static int ioc_qos_show(struct seq_file *sf, void *v)
2180 {
2181 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2182 
2183 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2184 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2185 	return 0;
2186 }
2187 
2188 static const match_table_t qos_ctrl_tokens = {
2189 	{ QOS_ENABLE,		"enable=%u"	},
2190 	{ QOS_CTRL,		"ctrl=%s"	},
2191 	{ NR_QOS_CTRL_PARAMS,	NULL		},
2192 };
2193 
2194 static const match_table_t qos_tokens = {
2195 	{ QOS_RPPM,		"rpct=%s"	},
2196 	{ QOS_RLAT,		"rlat=%u"	},
2197 	{ QOS_WPPM,		"wpct=%s"	},
2198 	{ QOS_WLAT,		"wlat=%u"	},
2199 	{ QOS_MIN,		"min=%s"	},
2200 	{ QOS_MAX,		"max=%s"	},
2201 	{ NR_QOS_PARAMS,	NULL		},
2202 };
2203 
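/*
 * io.cost.qos write handler.  Takes "MAJ:MIN" followed by key=value pairs
 * from the tables above.  Setting any QoS parameter switches to user
 * control; "ctrl=auto" reverts to the automatically selected parameters.
 */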
2204 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2205 			     size_t nbytes, loff_t off)
2206 {
2207 	struct gendisk *disk;
2208 	struct ioc *ioc;
2209 	u32 qos[NR_QOS_PARAMS];
2210 	bool enable, user;
2211 	char *p;
2212 	int ret;
2213 
2214 	disk = blkcg_conf_get_disk(&input);
2215 	if (IS_ERR(disk))
2216 		return PTR_ERR(disk);
2217 
2218 	ioc = q_to_ioc(disk->queue);
2219 	if (!ioc) {
2220 		ret = blk_iocost_init(disk->queue);
2221 		if (ret)
2222 			goto err;
2223 		ioc = q_to_ioc(disk->queue);
2224 	}
2225 
2226 	spin_lock_irq(&ioc->lock);
2227 	memcpy(qos, ioc->params.qos, sizeof(qos));
2228 	enable = ioc->enabled;
2229 	user = ioc->user_qos_params;
2230 	spin_unlock_irq(&ioc->lock);
2231 
2232 	while ((p = strsep(&input, " \t\n"))) {
2233 		substring_t args[MAX_OPT_ARGS];
2234 		char buf[32];
2235 		int tok;
2236 		s64 v;
2237 
2238 		if (!*p)
2239 			continue;
2240 
2241 		switch (match_token(p, qos_ctrl_tokens, args)) {
2242 		case QOS_ENABLE:
2243 			match_u64(&args[0], &v);
2244 			enable = v;
2245 			continue;
2246 		case QOS_CTRL:
2247 			match_strlcpy(buf, &args[0], sizeof(buf));
2248 			if (!strcmp(buf, "auto"))
2249 				user = false;
2250 			else if (!strcmp(buf, "user"))
2251 				user = true;
2252 			else
2253 				goto einval;
2254 			continue;
2255 		}
2256 
2257 		tok = match_token(p, qos_tokens, args);
2258 		switch (tok) {
2259 		case QOS_RPPM:
2260 		case QOS_WPPM:
2261 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2262 			    sizeof(buf))
2263 				goto einval;
2264 			if (cgroup_parse_float(buf, 2, &v))
2265 				goto einval;
2266 			if (v < 0 || v > 10000)
2267 				goto einval;
2268 			qos[tok] = v * 100;
2269 			break;
2270 		case QOS_RLAT:
2271 		case QOS_WLAT:
2272 			if (match_u64(&args[0], &v))
2273 				goto einval;
2274 			qos[tok] = v;
2275 			break;
2276 		case QOS_MIN:
2277 		case QOS_MAX:
2278 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2279 			    sizeof(buf))
2280 				goto einval;
2281 			if (cgroup_parse_float(buf, 2, &v))
2282 				goto einval;
2283 			if (v < 0)
2284 				goto einval;
2285 			qos[tok] = clamp_t(s64, v * 100,
2286 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2287 			break;
2288 		default:
2289 			goto einval;
2290 		}
2291 		user = true;
2292 	}
2293 
2294 	if (qos[QOS_MIN] > qos[QOS_MAX])
2295 		goto einval;
2296 
2297 	spin_lock_irq(&ioc->lock);
2298 
2299 	if (enable) {
2300 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2301 		ioc->enabled = true;
2302 	} else {
2303 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2304 		ioc->enabled = false;
2305 	}
2306 
2307 	if (user) {
2308 		memcpy(ioc->params.qos, qos, sizeof(qos));
2309 		ioc->user_qos_params = true;
2310 	} else {
2311 		ioc->user_qos_params = false;
2312 	}
2313 
2314 	ioc_refresh_params(ioc, true);
2315 	spin_unlock_irq(&ioc->lock);
2316 
2317 	put_disk_and_module(disk);
2318 	return nbytes;
2319 einval:
2320 	ret = -EINVAL;
2321 err:
2322 	put_disk_and_module(disk);
2323 	return ret;
2324 }
2325 
2326 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2327 				 struct blkg_policy_data *pd, int off)
2328 {
2329 	const char *dname = blkg_dev_name(pd->blkg);
2330 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2331 	u64 *u = ioc->params.i_lcoefs;
2332 
2333 	if (!dname)
2334 		return 0;
2335 
2336 	seq_printf(sf, "%s ctrl=%s model=linear "
2337 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2338 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2339 		   dname, ioc->user_cost_model ? "user" : "auto",
2340 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2341 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2342 	return 0;
2343 }
2344 
2345 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2346 {
2347 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2348 
2349 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2350 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2351 	return 0;
2352 }
2353 
2354 static const match_table_t cost_ctrl_tokens = {
2355 	{ COST_CTRL,		"ctrl=%s"	},
2356 	{ COST_MODEL,		"model=%s"	},
2357 	{ NR_COST_CTRL_PARAMS,	NULL		},
2358 };
2359 
2360 static const match_table_t i_lcoef_tokens = {
2361 	{ I_LCOEF_RBPS,		"rbps=%u"	},
2362 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2363 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2364 	{ I_LCOEF_WBPS,		"wbps=%u"	},
2365 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2366 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2367 	{ NR_I_LCOEFS,		NULL		},
2368 };
2369 
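/*
 * io.cost.model write handler.  Only the linear model is accepted.  Setting
 * any coefficient switches to a user-defined cost model; "ctrl=auto" reverts
 * to the builtin device defaults.
 */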
2370 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2371 				    size_t nbytes, loff_t off)
2372 {
2373 	struct gendisk *disk;
2374 	struct ioc *ioc;
2375 	u64 u[NR_I_LCOEFS];
2376 	bool user;
2377 	char *p;
2378 	int ret;
2379 
2380 	disk = blkcg_conf_get_disk(&input);
2381 	if (IS_ERR(disk))
2382 		return PTR_ERR(disk);
2383 
2384 	ioc = q_to_ioc(disk->queue);
2385 	if (!ioc) {
2386 		ret = blk_iocost_init(disk->queue);
2387 		if (ret)
2388 			goto err;
2389 		ioc = q_to_ioc(disk->queue);
2390 	}
2391 
2392 	spin_lock_irq(&ioc->lock);
2393 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2394 	user = ioc->user_cost_model;
2395 	spin_unlock_irq(&ioc->lock);
2396 
2397 	while ((p = strsep(&input, " \t\n"))) {
2398 		substring_t args[MAX_OPT_ARGS];
2399 		char buf[32];
2400 		int tok;
2401 		u64 v;
2402 
2403 		if (!*p)
2404 			continue;
2405 
2406 		switch (match_token(p, cost_ctrl_tokens, args)) {
2407 		case COST_CTRL:
2408 			match_strlcpy(buf, &args[0], sizeof(buf));
2409 			if (!strcmp(buf, "auto"))
2410 				user = false;
2411 			else if (!strcmp(buf, "user"))
2412 				user = true;
2413 			else
2414 				goto einval;
2415 			continue;
2416 		case COST_MODEL:
2417 			match_strlcpy(buf, &args[0], sizeof(buf));
2418 			if (strcmp(buf, "linear"))
2419 				goto einval;
2420 			continue;
2421 		}
2422 
2423 		tok = match_token(p, i_lcoef_tokens, args);
2424 		if (tok == NR_I_LCOEFS)
2425 			goto einval;
2426 		if (match_u64(&args[0], &v))
2427 			goto einval;
2428 		u[tok] = v;
2429 		user = true;
2430 	}
2431 
2432 	spin_lock_irq(&ioc->lock);
2433 	if (user) {
2434 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2435 		ioc->user_cost_model = true;
2436 	} else {
2437 		ioc->user_cost_model = false;
2438 	}
2439 	ioc_refresh_params(ioc, true);
2440 	spin_unlock_irq(&ioc->lock);
2441 
2442 	put_disk_and_module(disk);
2443 	return nbytes;
2444 
2445 einval:
2446 	ret = -EINVAL;
2447 err:
2448 	put_disk_and_module(disk);
2449 	return ret;
2450 }
2451 
2452 static struct cftype ioc_files[] = {
2453 	{
2454 		.name = "weight",
2455 		.flags = CFTYPE_NOT_ON_ROOT,
2456 		.seq_show = ioc_weight_show,
2457 		.write = ioc_weight_write,
2458 	},
2459 	{
2460 		.name = "cost.qos",
2461 		.flags = CFTYPE_ONLY_ON_ROOT,
2462 		.seq_show = ioc_qos_show,
2463 		.write = ioc_qos_write,
2464 	},
2465 	{
2466 		.name = "cost.model",
2467 		.flags = CFTYPE_ONLY_ON_ROOT,
2468 		.seq_show = ioc_cost_model_show,
2469 		.write = ioc_cost_model_write,
2470 	},
2471 	{}
2472 };
2473 
2474 static struct blkcg_policy blkcg_policy_iocost = {
2475 	.dfl_cftypes	= ioc_files,
2476 	.cpd_alloc_fn	= ioc_cpd_alloc,
2477 	.cpd_free_fn	= ioc_cpd_free,
2478 	.pd_alloc_fn	= ioc_pd_alloc,
2479 	.pd_init_fn	= ioc_pd_init,
2480 	.pd_free_fn	= ioc_pd_free,
2481 };
2482 
2483 static int __init ioc_init(void)
2484 {
2485 	return blkcg_policy_register(&blkcg_policy_iocost);
2486 }
2487 
2488 static void __exit ioc_exit(void)
2489 {
2490 	return blkcg_policy_unregister(&blkcg_policy_iocost);
2491 }
2492 
2493 module_init(ioc_init);
2494 module_exit(ioc_exit);
2495