xref: /linux/block/blk-iocost.c (revision 7954c92ede882b0dfd52a5db90291a4151b44c1a)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (WEIGHT_ONE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO if doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, squaring who can
149  * donate and should take back how much requires hweight propagations
150  * anyway making it easier to implement and understand as a separate
151  * mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - delay	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <asm/local.h>
182 #include <asm/local64.h>
183 #include "blk-rq-qos.h"
184 #include "blk-stat.h"
185 #include "blk-wbt.h"
186 #include "blk-cgroup.h"
187 
188 #ifdef CONFIG_TRACEPOINTS
189 
190 /*
 * Copied from TRACE_CGROUP_PATH, see cgroup-internal.h.
 *
 * A single static buffer, serialized by trace_iocg_path_lock, is used to
 * format the cgroup path handed to the iocost tracepoints.
 */
191 #define TRACE_IOCG_PATH_LEN 1024
192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194 
/*
 * TRACE_IOCG_PATH(type, iocg, ...) - emit trace_iocost_<type>() for @iocg
 *
 * Does nothing unless trace_iocost_<type>_enabled() is true.  Otherwise the
 * path of @iocg's cgroup is formatted into trace_iocg_path and passed to the
 * tracepoint together with @iocg and the trailing arguments, all while
 * holding trace_iocg_path_lock with IRQs saved/disabled.
 */
195 #define TRACE_IOCG_PATH(type, iocg, ...)					\
196 	do {									\
197 		unsigned long flags;						\
198 		if (trace_iocost_##type##_enabled()) {				\
199 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
200 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
201 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
202 			trace_iocost_##type(iocg, trace_iocg_path,		\
203 					      ##__VA_ARGS__);			\
204 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
205 		}								\
206 	} while (0)
207 
208 #else	/* !CONFIG_TRACEPOINTS */
/* tracepoints compiled out: the macro expands to an empty statement */
209 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
210 #endif	/* CONFIG_TRACEPOINTS */
211 
/* General tunables: ppm scale, timer period bounds, vtime margins, weight unit. */
212 enum {
213 	MILLION			= 1000000,	/* denominator for ppm values */
214 
215 	/*
 * Timer period is calculated from latency requirements, bound it.
 * Both bounds are in microseconds and are applied to ioc->period_us
 * in ioc_refresh_period_us().
 */
216 	MIN_PERIOD		= USEC_PER_MSEC,
217 	MAX_PERIOD		= USEC_PER_SEC,
218 
219 	/*
220 	 * iocg->vtime is targeted at 50% behind the device vtime, which
221 	 * serves as its IO credit buffer.  Surplus weight adjustment is
222 	 * immediately canceled if the vtime margin runs below 10%.
 *
 * The three values are percentages of the timer period; they are
 * converted to vtime amounts in ioc_refresh_margins().
223 	 */
224 	MARGIN_MIN_PCT		= 10,
225 	MARGIN_LOW_PCT		= 20,
226 	MARGIN_TARGET_PCT	= 50,
227 
228 	INUSE_ADJ_STEP_PCT	= 25,	/* NOTE(review): users are outside this excerpt */
229 
230 	/* Have some play in timer operations */
231 	TIMER_SLACK_PCT		= 1,
232 
233 	/* 1/64k is granular enough and can easily be handled w/ u32 */
234 	WEIGHT_ONE		= 1 << 16,
235 };
236 
/* vtime unit definitions and vrate bounds. */
237 enum {
238 	/*
239 	 * As vtime is used to calculate the cost of each IO, it needs to
240 	 * be fairly high precision.  For example, it should be able to
241 	 * represent the cost of a single page worth of discard with
242 	 * sufficient accuracy.  At the same time, it should be able to
243 	 * represent reasonably long enough durations to be useful and
244 	 * convenient during operation.
245 	 *
246 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
247 	 * granularity and days of wrap-around time even at extreme vrates.
248 	 */
249 	VTIME_PER_SEC_SHIFT	= 37,
250 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
251 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
252 	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
253 
254 	/* bound vrate adjustments within two orders of magnitude */
255 	VRATE_MIN_PPM		= 10000,	/* 1% */
256 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
257 
	/* VRATE_MIN_PPM expressed as vtime per microsecond of wall clock */
258 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
259 	VRATE_CLAMP_ADJ_PCT	= 4,
260 
261 	/* switch iff the conditions are met for longer than this */
262 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
263 };
264 
/* Busy detection, debt/delay handling and cost-model granularity constants. */
265 enum {
266 	/* if IOs end up waiting for requests, issue less */
267 	RQ_WAIT_BUSY_PCT	= 5,
268 
269 	/* unbusy hysteresis */
270 	UNBUSY_THR_PCT		= 75,
271 
272 	/*
273 	 * The effect of delay is indirect and non-linear and a huge amount of
274 	 * future debt can accumulate abruptly while unthrottled. Linearly scale
275 	 * up delay as debt is going up and then let it decay exponentially.
276 	 * This gives us quick ramp ups while delay is accumulating and long
277 	 * tails which can help reducing the frequency of debt explosions on
278 	 * unthrottle. The parameters are experimentally determined.
279 	 *
280 	 * The delay mechanism provides adequate protection and behavior in many
281 	 * cases. However, this is far from ideal and falls short on both
282 	 * fronts. The debtors are often throttled too harshly costing a
283 	 * significant level of fairness and possibly total work while the
284 	 * protection against their impacts on the system can be choppy and
285 	 * unreliable.
286 	 *
287 	 * The shortcoming primarily stems from the fact that, unlike for page
288 	 * cache, the kernel doesn't have well-defined back-pressure propagation
289 	 * mechanism and policies for anonymous memory. Fully addressing this
290 	 * issue will likely require substantial improvements in the area.
291 	 */
292 	MIN_DELAY_THR_PCT	= 500,
293 	MAX_DELAY_THR_PCT	= 25000,
294 	MIN_DELAY		= 250,
295 	MAX_DELAY		= 250 * USEC_PER_MSEC,
296 
297 	/* halve debts if avg usage over 100ms is under 50% */
298 	DFGV_USAGE_PCT		= 50,
299 	DFGV_PERIOD		= 100 * USEC_PER_MSEC,
300 
301 	/* don't let cmds which take a very long time pin lagging for too long */
302 	MAX_LAGGING_PERIODS	= 10,
303 
304 	/*
305 	 * Count IO size in 4k pages.  The 12bit shift helps keeping
306 	 * size-proportional components of cost calculation in closer
307 	 * numbers of digits to per-IO cost components.
308 	 */
309 	IOC_PAGE_SHIFT		= 12,
310 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
311 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
312 
313 	/* if apart further than 16M, consider randio for linear model */
314 	LCOEF_RANDIO_PAGES	= 4096,	/* 4096 * IOC_PAGE_SIZE == 16M */
315 };
316 
/*
 * Value type of ioc->running.
 * NOTE(review): the state transitions are implemented outside this excerpt;
 * the names suggest idle / running / stopped - confirm against the users.
 */
317 enum ioc_running {
318 	IOC_IDLE,
319 	IOC_RUNNING,
320 	IOC_STOP,
321 };
322 
323 /* io.cost.qos controls including per-dev enable of the whole controller */
324 enum {
325 	QOS_ENABLE,
326 	QOS_CTRL,
327 	NR_QOS_CTRL_PARAMS,	/* number of control entries above */
328 };
329 
330 /*
 * io.cost.qos params
 *
 * Indices into ioc_params.qos[].  The R and W prefixed entries are the ppm and
 * latency targets consumed by ioc_refresh_period_us(); QOS_MIN/QOS_MAX are
 * ppm bounds converted to ioc->vrate_min/max by ioc_refresh_params_disk().
 */
331 enum {
332 	QOS_RPPM,
333 	QOS_RLAT,
334 	QOS_WPPM,
335 	QOS_WLAT,
336 	QOS_MIN,
337 	QOS_MAX,
338 	NR_QOS_PARAMS,	/* size of ioc_params.qos[] */
339 };
340 
341 /* io.cost.model controls */
342 enum {
343 	COST_CTRL,
344 	COST_MODEL,
345 	NR_COST_CTRL_PARAMS,	/* number of control entries above */
346 };
347 
348 /*
 * builtin linear cost model coefficients
 *
 * Indices into ioc_params.i_lcoefs[], the input form of the model: per
 * direction (R/W) a bps value plus sequential and random iops values.
 * ioc_refresh_lcoefs() feeds them to calc_lcoefs().
 */
349 enum {
350 	I_LCOEF_RBPS,
351 	I_LCOEF_RSEQIOPS,
352 	I_LCOEF_RRANDIOPS,
353 	I_LCOEF_WBPS,
354 	I_LCOEF_WSEQIOPS,
355 	I_LCOEF_WRANDIOPS,
356 	NR_I_LCOEFS,	/* size of ioc_params.i_lcoefs[] */
357 };
358 
/*
 * Indices into ioc_params.lcoefs[], the derived form of the linear model:
 * per direction (R/W) a per-page cost plus the base cost of a sequential and
 * a random IO, as produced by calc_lcoefs().
 */
359 enum {
360 	LCOEF_RPAGE,
361 	LCOEF_RSEQIO,
362 	LCOEF_RRANDIO,
363 	LCOEF_WPAGE,
364 	LCOEF_WSEQIO,
365 	LCOEF_WRANDIO,
366 	NR_LCOEFS,	/* size of ioc_params.lcoefs[] */
367 };
368 
/* Indices into the autop[] preset table; chosen by ioc_autop_idx(). */
369 enum {
370 	AUTOP_INVALID,	/* has no initializer in autop[] */
371 	AUTOP_HDD,	/* returned for rotational queues */
372 	AUTOP_SSD_QD1,	/* returned when the queue depth is 1 */
373 	AUTOP_SSD_DFL,
374 	AUTOP_SSD_FAST,
375 };
376 
/*
 * One set of QoS and cost-model parameters.  Used both for the builtin
 * presets in autop[] and for the effective parameters in ioc->params.
 */
377 struct ioc_params {
378 	u32				qos[NR_QOS_PARAMS];	/* indexed by QOS_* */
379 	u64				i_lcoefs[NR_I_LCOEFS];	/* indexed by I_LCOEF_* */
380 	u64				lcoefs[NR_LCOEFS];	/* derived, see ioc_refresh_lcoefs() */
	/*
	 * Base vrate percentages at which ioc_autop_idx() steps to the next /
	 * previous preset; 0 disables the respective check.
	 */
381 	u32				too_fast_vrate_pct;
382 	u32				too_slow_vrate_pct;
383 };
384 
/*
 * vtime margins computed by ioc_refresh_margins() as MARGIN_{MIN|LOW|TARGET}_PCT
 * of the timer period multiplied by the base vrate.
 */
385 struct ioc_margins {
386 	s64				min;
387 	s64				low;
388 	s64				target;
389 };
390 
/*
 * NOTE(review): presumably counts IOs which met / missed the latency target,
 * with last_* holding the values seen at the previous readout - the users are
 * outside this excerpt, confirm there.
 */
391 struct ioc_missed {
392 	local_t				nr_met;
393 	local_t				nr_missed;
394 	u32				last_met;
395 	u32				last_missed;
396 };
397 
/* Per-cpu device statistics; allocated as ioc->pcpu_stat. */
398 struct ioc_pcpu_stat {
399 	struct ioc_missed		missed[2];	/* NOTE(review): presumably one per IO direction - confirm */
400 
401 	local64_t			rq_wait_ns;
402 	u64				last_rq_wait_ns;
403 };
404 
405 /* per device */
406 struct ioc {
407 	struct rq_qos			rqos;	/* embedded; see rqos_to_ioc() */
408 
409 	bool				enabled;
410 
411 	struct ioc_params		params;	/* effective QoS / cost model params */
412 	struct ioc_margins		margins;	/* see ioc_refresh_margins() */
413 	u32				period_us;	/* timer period, see ioc_refresh_period_us() */
414 	u32				timer_slack_ns;	/* TIMER_SLACK_PCT of the period */
415 	u64				vrate_min;	/* from qos[QOS_MIN], see ioc_refresh_params_disk() */
416 	u64				vrate_max;	/* from qos[QOS_MAX], see ioc_refresh_params_disk() */
417 
418 	spinlock_t			lock;
419 	struct timer_list		timer;
420 	struct list_head		active_iocgs;	/* active cgroups */
421 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
422 
423 	enum ioc_running		running;
424 	atomic64_t			vtime_rate;	/* base rate + compensation, see ioc_refresh_vrate() */
425 	u64				vtime_base_rate;
426 	s64				vtime_err;	/* bounded to +-one period of vtime in ioc_refresh_vrate() */
427 
428 	seqcount_spinlock_t		period_seqcount;
429 	u64				period_at;	/* wallclock starttime */
430 	u64				period_at_vtime; /* vtime starttime */
431 
432 	atomic64_t			cur_period;	/* inc'd each period */
433 	int				busy_level;	/* saturation history */
434 
435 	bool				weights_updated;
436 	atomic_t			hweight_gen;	/* for lazy hweights */
437 
438 	/* debt forgiveness */
439 	u64				dfgv_period_at;
440 	u64				dfgv_period_rem;
441 	u64				dfgv_usage_us_sum;
442 
	/*
	 * Automatic parameter selection state, see ioc_autop_idx().  The two
	 * timestamps are in ns and are 0 while the respective condition
	 * isn't met.  When a user_* bit is set, ioc_refresh_params_disk()
	 * doesn't overwrite the corresponding params with the preset.
	 */
443 	u64				autop_too_fast_at;
444 	u64				autop_too_slow_at;
445 	int				autop_idx;	/* index into autop[] */
446 	bool				user_qos_params:1;
447 	bool				user_cost_model:1;
448 };
449 
/* Per-cpu iocg statistics; allocated as ioc_gq->pcpu_stat. */
450 struct iocg_pcpu_stat {
451 	local64_t			abs_vusage;	/* sum of abs_cost added by iocg_commit_bio() */
452 };
453 
/*
 * Cumulative iocg statistics, kept in ioc_gq->stat and ->last_stat.
 * NOTE(review): the _us suffix suggests microseconds; the code which fills
 * these in is outside this excerpt.
 */
454 struct iocg_stat {
455 	u64				usage_us;
456 	u64				wait_us;
457 	u64				indebt_us;
458 	u64				indelay_us;
459 };
460 
461 /* per device-cgroup pair */
462 struct ioc_gq {
463 	struct blkg_policy_data		pd;	/* embedded; see pd_to_iocg() */
464 	struct ioc			*ioc;	/* owning device */
465 
466 	/*
467 	 * An iocg can get its weight from two sources - an explicit
468 	 * per-device-cgroup configuration or the default weight of the
469 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
470 	 * configuration.  `weight` is the effective considering both
471 	 * sources.
472 	 *
473 	 * When an idle cgroup becomes active its `active` goes from 0 to
474 	 * `weight`.  `inuse` is the surplus adjusted active weight.
475 	 * `active` and `inuse` are used to calculate `hweight_active` and
476 	 * `hweight_inuse`.
477 	 *
478 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
479 	 * surplus adjustments.
480 	 *
481 	 * `inuse` may be adjusted dynamically during period. `saved_*` are used
482 	 * to determine and track adjustments.
483 	 */
484 	u32				cfg_weight;
485 	u32				weight;
486 	u32				active;
487 	u32				inuse;
488 
489 	u32				last_inuse;
490 	s64				saved_margin;
491 
492 	sector_t			cursor;		/* to detect randio */
493 
494 	/*
495 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
496 	 * issued.  If lagging behind device vtime, the delta represents
497 	 * the currently available IO budget.  If running ahead, the
498 	 * overage.
499 	 *
500 	 * `done_vtime` is the same but progressed on completion rather
501 	 * than issue.  The delta behind `vtime` represents the cost of
502 	 * currently in-flight IOs.
503 	 */
504 	atomic64_t			vtime;
505 	atomic64_t			done_vtime;
506 	u64				abs_vdebt;
507 
508 	/* current delay in effect and when it started */
509 	u64				delay;
510 	u64				delay_at;
511 
512 	/*
513 	 * The period this iocg was last active in.  Used for deactivation
514 	 * and invalidating `vtime`.
515 	 */
516 	atomic64_t			active_period;
517 	struct list_head		active_list;
518 
519 	/* see __propagate_weights() and current_hweight() for details */
520 	u64				child_active_sum;
521 	u64				child_inuse_sum;
522 	u64				child_adjusted_sum;
523 	int				hweight_gen;
524 	u32				hweight_active;
525 	u32				hweight_inuse;
526 	u32				hweight_donating;
527 	u32				hweight_after_donation;
528 
529 	struct list_head		walk_list;
530 	struct list_head		surplus_list;
531 
	/* waitq.lock doubles as the iocg lock, see iocg_lock() */
532 	struct wait_queue_head		waitq;
533 	struct hrtimer			waitq_timer;
534 
535 	/* timestamp at the latest activation */
536 	u64				activated_at;
537 
538 	/* statistics */
539 	struct iocg_pcpu_stat __percpu	*pcpu_stat;
540 	struct iocg_stat		stat;
541 	struct iocg_stat		last_stat;
542 	u64				last_stat_abs_vusage;
543 	u64				usage_delta_us;
544 	u64				wait_since;
545 	u64				indebt_since;
546 	u64				indelay_since;
547 
548 	/* this iocg's depth in the hierarchy and ancestors including self */
549 	int				level;
550 	struct ioc_gq			*ancestors[];
551 };
552 
553 /* per cgroup */
554 struct ioc_cgrp {
555 	struct blkcg_policy_data	cpd;	/* embedded; see blkcg_to_iocc() */
556 	unsigned int			dfl_weight;	/* presumably the cgroup's default weight - confirm */
557 };
558 
/*
 * Snapshot of the current time passed around by the timer / issue paths.
 * NOTE(review): presumably wall clock in ns (now_ns), wall clock in the unit
 * of ioc->period_at (now) and the device vtime (vnow) - the code filling it
 * in is outside this excerpt.
 */
559 struct ioc_now {
560 	u64				now_ns;
561 	u64				now;
562 	u64				vnow;
563 };
564 
/*
 * Wait queue entry bundling a bio with its absolute cost.
 * NOTE(review): presumably queued on ioc_gq->waitq while the bio waits for
 * budget, with `committed` set once its cost has been charged - confirm at
 * the users, which are outside this excerpt.
 */
565 struct iocg_wait {
566 	struct wait_queue_entry		wait;
567 	struct bio			*bio;
568 	u64				abs_cost;
569 	bool				committed;
570 };
571 
/*
 * Context handed to the waiter wake-up callback.
 * NOTE(review): presumably the iocg being woken, its hweight_inuse and the
 * remaining vtime budget - the users are outside this excerpt.
 */
572 struct iocg_wake_ctx {
573 	struct ioc_gq			*iocg;
574 	u32				hw_inuse;
575 	s64				vbudget;
576 };
577 
/*
 * Builtin parameter presets indexed by AUTOP_*.  ioc_autop_idx() picks the
 * entry and ioc_refresh_params_disk() copies its qos[] / i_lcoefs[] into
 * ioc->params unless the user has overridden them.  AUTOP_INVALID has no
 * initializer and is therefore all zeroes.  lcoefs[] is left zero here; it
 * is derived from i_lcoefs[] by ioc_refresh_lcoefs().
 */
578 static const struct ioc_params autop[] = {
579 	[AUTOP_HDD] = {
580 		.qos				= {
581 			[QOS_RLAT]		=        250000, /* 250ms */
582 			[QOS_WLAT]		=        250000,
583 			[QOS_MIN]		= VRATE_MIN_PPM,
584 			[QOS_MAX]		= VRATE_MAX_PPM,
585 		},
586 		.i_lcoefs			= {
587 			[I_LCOEF_RBPS]		=     174019176,
588 			[I_LCOEF_RSEQIOPS]	=         41708,
589 			[I_LCOEF_RRANDIOPS]	=           370,
590 			[I_LCOEF_WBPS]		=     178075866,
591 			[I_LCOEF_WSEQIOPS]	=         42705,
592 			[I_LCOEF_WRANDIOPS]	=           378,
593 		},
594 	},
595 	[AUTOP_SSD_QD1] = {
596 		.qos				= {
597 			[QOS_RLAT]		=         25000, /* 25ms */
598 			[QOS_WLAT]		=         25000,
599 			[QOS_MIN]		= VRATE_MIN_PPM,
600 			[QOS_MAX]		= VRATE_MAX_PPM,
601 		},
602 		.i_lcoefs			= {
603 			[I_LCOEF_RBPS]		=     245855193,
604 			[I_LCOEF_RSEQIOPS]	=         61575,
605 			[I_LCOEF_RRANDIOPS]	=          6946,
606 			[I_LCOEF_WBPS]		=     141365009,
607 			[I_LCOEF_WSEQIOPS]	=         33716,
608 			[I_LCOEF_WRANDIOPS]	=         26796,
609 		},
610 	},
611 	[AUTOP_SSD_DFL] = {
612 		.qos				= {
613 			[QOS_RLAT]		=         25000, /* 25ms */
614 			[QOS_WLAT]		=         25000,
615 			[QOS_MIN]		= VRATE_MIN_PPM,
616 			[QOS_MAX]		= VRATE_MAX_PPM,
617 		},
618 		.i_lcoefs			= {
619 			[I_LCOEF_RBPS]		=     488636629,
620 			[I_LCOEF_RSEQIOPS]	=          8932,
621 			[I_LCOEF_RRANDIOPS]	=          8518,
622 			[I_LCOEF_WBPS]		=     427891549,
623 			[I_LCOEF_WSEQIOPS]	=         28755,
624 			[I_LCOEF_WRANDIOPS]	=         21940,
625 		},
		/* step up to AUTOP_SSD_FAST once base vrate stays >= 500% */
626 		.too_fast_vrate_pct		=           500,
627 	},
628 	[AUTOP_SSD_FAST] = {
629 		.qos				= {
630 			[QOS_RLAT]		=          5000, /* 5ms */
631 			[QOS_WLAT]		=          5000,
632 			[QOS_MIN]		= VRATE_MIN_PPM,
633 			[QOS_MAX]		= VRATE_MAX_PPM,
634 		},
635 		.i_lcoefs			= {
636 			[I_LCOEF_RBPS]		=    3102524156LLU,
637 			[I_LCOEF_RSEQIOPS]	=        724816,
638 			[I_LCOEF_RRANDIOPS]	=        778122,
639 			[I_LCOEF_WBPS]		=    1742780862LLU,
640 			[I_LCOEF_WSEQIOPS]	=        425702,
641 			[I_LCOEF_WRANDIOPS]	=	 443193,
642 		},
		/* step back down to AUTOP_SSD_DFL once base vrate stays <= 10% */
643 		.too_slow_vrate_pct		=            10,
644 	},
645 };
646 
647 /*
648  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
649  * vtime credit shortage and down on device saturation.
 *
 * NOTE(review): ioc->busy_level is a signed int; presumably the table is
 * indexed by its magnitude, capped at the last entry - confirm at the use
 * site, which is outside this excerpt.
650  */
651 static u32 vrate_adj_pct[] =
652 	{ 0, 0, 0, 0,
653 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
654 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
655 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
656 
/* declared early because blkg_to_iocg() and blkcg_to_iocc() need its address */
657 static struct blkcg_policy blkcg_policy_iocost;
658 
659 /* accessors and helpers */
660 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
661 {
662 	return container_of(rqos, struct ioc, rqos);
663 }
664 
665 static struct ioc *q_to_ioc(struct request_queue *q)
666 {
667 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
668 }
669 
670 static const char __maybe_unused *ioc_name(struct ioc *ioc)
671 {
672 	struct gendisk *disk = ioc->rqos.disk;
673 
674 	if (!disk)
675 		return "<unknown>";
676 	return disk->disk_name;
677 }
678 
679 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
680 {
681 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
682 }
683 
684 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
685 {
686 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
687 }
688 
689 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
690 {
691 	return pd_to_blkg(&iocg->pd);
692 }
693 
694 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
695 {
696 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
697 			    struct ioc_cgrp, cpd);
698 }
699 
700 /*
701  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
702  * weight, the more expensive each IO.  Must round up.
703  */
704 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
705 {
706 	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
707 }
708 
709 /*
710  * The inverse of abs_cost_to_cost().  Must round up.
711  */
712 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
713 {
714 	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
715 }
716 
717 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
718 			    u64 abs_cost, u64 cost)
719 {
720 	struct iocg_pcpu_stat *gcs;
721 
722 	bio->bi_iocost_cost = cost;
723 	atomic64_add(cost, &iocg->vtime);
724 
725 	gcs = get_cpu_ptr(iocg->pcpu_stat);
726 	local64_add(abs_cost, &gcs->abs_vusage);
727 	put_cpu_ptr(gcs);
728 }
729 
730 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
731 {
732 	if (lock_ioc) {
733 		spin_lock_irqsave(&iocg->ioc->lock, *flags);
734 		spin_lock(&iocg->waitq.lock);
735 	} else {
736 		spin_lock_irqsave(&iocg->waitq.lock, *flags);
737 	}
738 }
739 
740 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
741 {
742 	if (unlock_ioc) {
743 		spin_unlock(&iocg->waitq.lock);
744 		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
745 	} else {
746 		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
747 	}
748 }
749 
750 #define CREATE_TRACE_POINTS
751 #include <trace/events/iocost.h>
752 
753 static void ioc_refresh_margins(struct ioc *ioc)
754 {
755 	struct ioc_margins *margins = &ioc->margins;
756 	u32 period_us = ioc->period_us;
757 	u64 vrate = ioc->vtime_base_rate;
758 
759 	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
760 	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
761 	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
762 }
763 
764 /* latency Qos params changed, update period_us and all the dependent params */
765 static void ioc_refresh_period_us(struct ioc *ioc)
766 {
767 	u32 ppm, lat, multi, period_us;
768 
769 	lockdep_assert_held(&ioc->lock);
770 
771 	/* pick the higher latency target */
772 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
773 		ppm = ioc->params.qos[QOS_RPPM];
774 		lat = ioc->params.qos[QOS_RLAT];
775 	} else {
776 		ppm = ioc->params.qos[QOS_WPPM];
777 		lat = ioc->params.qos[QOS_WLAT];
778 	}
779 
780 	/*
781 	 * We want the period to be long enough to contain a healthy number
782 	 * of IOs while short enough for granular control.  Define it as a
783 	 * multiple of the latency target.  Ideally, the multiplier should
784 	 * be scaled according to the percentile so that it would nominally
785 	 * contain a certain number of requests.  Let's be simpler and
786 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
787 	 */
788 	if (ppm)
789 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
790 	else
791 		multi = 2;
792 	period_us = multi * lat;
793 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
794 
795 	/* calculate dependent params */
796 	ioc->period_us = period_us;
797 	ioc->timer_slack_ns = div64_u64(
798 		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
799 		100);
800 	ioc_refresh_margins(ioc);
801 }
802 
803 /*
804  *  ioc->rqos.disk isn't initialized when this function is called from
805  *  the init path.
806  */
807 static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
808 {
809 	int idx = ioc->autop_idx;
810 	const struct ioc_params *p = &autop[idx];
811 	u32 vrate_pct;
812 	u64 now_ns;
813 
814 	/* rotational? */
815 	if (!blk_queue_nonrot(disk->queue))
816 		return AUTOP_HDD;
817 
818 	/* handle SATA SSDs w/ broken NCQ */
819 	if (blk_queue_depth(disk->queue) == 1)
820 		return AUTOP_SSD_QD1;
821 
822 	/* use one of the normal ssd sets */
823 	if (idx < AUTOP_SSD_DFL)
824 		return AUTOP_SSD_DFL;
825 
826 	/* if user is overriding anything, maintain what was there */
827 	if (ioc->user_qos_params || ioc->user_cost_model)
828 		return idx;
829 
830 	/* step up/down based on the vrate */
831 	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
832 	now_ns = blk_time_get_ns();
833 
834 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
835 		if (!ioc->autop_too_fast_at)
836 			ioc->autop_too_fast_at = now_ns;
837 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
838 			return idx + 1;
839 	} else {
840 		ioc->autop_too_fast_at = 0;
841 	}
842 
843 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
844 		if (!ioc->autop_too_slow_at)
845 			ioc->autop_too_slow_at = now_ns;
846 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
847 			return idx - 1;
848 	} else {
849 		ioc->autop_too_slow_at = 0;
850 	}
851 
852 	return idx;
853 }
854 
855 /*
856  * Take the followings as input
857  *
858  *  @bps	maximum sequential throughput
859  *  @seqiops	maximum sequential 4k iops
860  *  @randiops	maximum random 4k iops
861  *
862  * and calculate the linear model cost coefficients.
863  *
864  *  *@page	per-page cost		1s / (@bps / 4096)
865  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
866  *  @randiops	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
867  */
868 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
869 			u64 *page, u64 *seqio, u64 *randio)
870 {
871 	u64 v;
872 
873 	*page = *seqio = *randio = 0;
874 
875 	if (bps) {
876 		u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
877 
878 		if (bps_pages)
879 			*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
880 		else
881 			*page = 1;
882 	}
883 
884 	if (seqiops) {
885 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
886 		if (v > *page)
887 			*seqio = v - *page;
888 	}
889 
890 	if (randiops) {
891 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
892 		if (v > *page)
893 			*randio = v - *page;
894 	}
895 }
896 
897 static void ioc_refresh_lcoefs(struct ioc *ioc)
898 {
899 	u64 *u = ioc->params.i_lcoefs;
900 	u64 *c = ioc->params.lcoefs;
901 
902 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
903 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
904 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
905 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
906 }
907 
908 /*
909  * struct gendisk is required as an argument because ioc->rqos.disk
910  * is not properly initialized when called from the init path.
911  */
912 static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
913 				    struct gendisk *disk)
914 {
915 	const struct ioc_params *p;
916 	int idx;
917 
918 	lockdep_assert_held(&ioc->lock);
919 
920 	idx = ioc_autop_idx(ioc, disk);
921 	p = &autop[idx];
922 
923 	if (idx == ioc->autop_idx && !force)
924 		return false;
925 
926 	if (idx != ioc->autop_idx) {
927 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
928 		ioc->vtime_base_rate = VTIME_PER_USEC;
929 	}
930 
931 	ioc->autop_idx = idx;
932 	ioc->autop_too_fast_at = 0;
933 	ioc->autop_too_slow_at = 0;
934 
935 	if (!ioc->user_qos_params)
936 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
937 	if (!ioc->user_cost_model)
938 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
939 
940 	ioc_refresh_period_us(ioc);
941 	ioc_refresh_lcoefs(ioc);
942 
943 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
944 					    VTIME_PER_USEC, MILLION);
945 	ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
946 					    VTIME_PER_USEC, MILLION);
947 
948 	return true;
949 }
950 
951 static bool ioc_refresh_params(struct ioc *ioc, bool force)
952 {
953 	return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
954 }
955 
956 /*
957  * When an iocg accumulates too much vtime or gets deactivated, we throw away
958  * some vtime, which lowers the overall device utilization. As the exact amount
959  * which is being thrown away is known, we can compensate by accelerating the
960  * vrate accordingly so that the extra vtime generated in the current period
961  * matches what got lost.
962  */
963 static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
964 {
965 	s64 pleft = ioc->period_at + ioc->period_us - now->now;
966 	s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
967 	s64 vcomp, vcomp_min, vcomp_max;
968 
969 	lockdep_assert_held(&ioc->lock);
970 
971 	/* we need some time left in this period */
972 	if (pleft <= 0)
973 		goto done;
974 
975 	/*
976 	 * Calculate how much vrate should be adjusted to offset the error.
977 	 * Limit the amount of adjustment and deduct the adjusted amount from
978 	 * the error.
979 	 */
980 	vcomp = -div64_s64(ioc->vtime_err, pleft);
981 	vcomp_min = -(ioc->vtime_base_rate >> 1);
982 	vcomp_max = ioc->vtime_base_rate;
983 	vcomp = clamp(vcomp, vcomp_min, vcomp_max);
984 
985 	ioc->vtime_err += vcomp * pleft;
986 
987 	atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
988 done:
989 	/* bound how much error can accumulate */
990 	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
991 }
992 
993 static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
994 				  int nr_lagging, int nr_shortages,
995 				  int prev_busy_level, u32 *missed_ppm)
996 {
997 	u64 vrate = ioc->vtime_base_rate;
998 	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
999 
1000 	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
1001 		if (ioc->busy_level != prev_busy_level || nr_lagging)
1002 			trace_iocost_ioc_vrate_adj(ioc, vrate,
1003 						   missed_ppm, rq_wait_pct,
1004 						   nr_lagging, nr_shortages);
1005 
1006 		return;
1007 	}
1008 
1009 	/*
1010 	 * If vrate is out of bounds, apply clamp gradually as the
1011 	 * bounds can change abruptly.  Otherwise, apply busy_level
1012 	 * based adjustment.
1013 	 */
1014 	if (vrate < vrate_min) {
1015 		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
1016 		vrate = min(vrate, vrate_min);
1017 	} else if (vrate > vrate_max) {
1018 		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1019 		vrate = max(vrate, vrate_max);
1020 	} else {
1021 		int idx = min_t(int, abs(ioc->busy_level),
1022 				ARRAY_SIZE(vrate_adj_pct) - 1);
1023 		u32 adj_pct = vrate_adj_pct[idx];
1024 
1025 		if (ioc->busy_level > 0)
1026 			adj_pct = 100 - adj_pct;
1027 		else
1028 			adj_pct = 100 + adj_pct;
1029 
1030 		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1031 			      vrate_min, vrate_max);
1032 	}
1033 
1034 	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1035 				   nr_lagging, nr_shortages);
1036 
1037 	ioc->vtime_base_rate = vrate;
1038 	ioc_refresh_margins(ioc);
1039 }
1040 
1041 /* take a snapshot of the current [v]time and vrate */
1042 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1043 {
1044 	unsigned seq;
1045 	u64 vrate;
1046 
1047 	now->now_ns = blk_time_get_ns();
1048 	now->now = ktime_to_us(now->now_ns);
1049 	vrate = atomic64_read(&ioc->vtime_rate);
1050 
1051 	/*
1052 	 * The current vtime is
1053 	 *
1054 	 *   vtime at period start + (wallclock time since the start) * vrate
1055 	 *
1056 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
1057 	 * needed, they're seqcount protected.
1058 	 */
1059 	do {
1060 		seq = read_seqcount_begin(&ioc->period_seqcount);
1061 		now->vnow = ioc->period_at_vtime +
1062 			(now->now - ioc->period_at) * vrate;
1063 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
1064 }
1065 
1066 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1067 {
1068 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1069 
1070 	write_seqcount_begin(&ioc->period_seqcount);
1071 	ioc->period_at = now->now;
1072 	ioc->period_at_vtime = now->vnow;
1073 	write_seqcount_end(&ioc->period_seqcount);
1074 
1075 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1076 	add_timer(&ioc->timer);
1077 }
1078 
1079 /*
1080  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1081  * weight sums and propagate upwards accordingly. If @save, the current margin
1082  * is saved to be used as reference for later inuse in-period adjustments.
1083  */
1084 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1085 				bool save, struct ioc_now *now)
1086 {
1087 	struct ioc *ioc = iocg->ioc;
1088 	int lvl;
1089 
1090 	lockdep_assert_held(&ioc->lock);
1091 
1092 	/*
1093 	 * For an active leaf node, its inuse shouldn't be zero or exceed
1094 	 * @active. An active internal node's inuse is solely determined by the
1095 	 * inuse to active ratio of its children regardless of @inuse.
1096 	 */
1097 	if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1098 		inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1099 					   iocg->child_active_sum);
1100 	} else {
1101 		inuse = clamp_t(u32, inuse, 1, active);
1102 	}
1103 
1104 	iocg->last_inuse = iocg->inuse;
1105 	if (save)
1106 		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1107 
1108 	if (active == iocg->active && inuse == iocg->inuse)
1109 		return;
1110 
1111 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1112 		struct ioc_gq *parent = iocg->ancestors[lvl];
1113 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1114 		u32 parent_active = 0, parent_inuse = 0;
1115 
1116 		/* update the level sums */
1117 		parent->child_active_sum += (s32)(active - child->active);
1118 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
1119 		/* apply the updates */
1120 		child->active = active;
1121 		child->inuse = inuse;
1122 
1123 		/*
1124 		 * The delta between inuse and active sums indicates that
1125 		 * much of weight is being given away.  Parent's inuse
1126 		 * and active should reflect the ratio.
1127 		 */
1128 		if (parent->child_active_sum) {
1129 			parent_active = parent->weight;
1130 			parent_inuse = DIV64_U64_ROUND_UP(
1131 				parent_active * parent->child_inuse_sum,
1132 				parent->child_active_sum);
1133 		}
1134 
1135 		/* do we need to keep walking up? */
1136 		if (parent_active == parent->active &&
1137 		    parent_inuse == parent->inuse)
1138 			break;
1139 
1140 		active = parent_active;
1141 		inuse = parent_inuse;
1142 	}
1143 
1144 	ioc->weights_updated = true;
1145 }
1146 
1147 static void commit_weights(struct ioc *ioc)
1148 {
1149 	lockdep_assert_held(&ioc->lock);
1150 
1151 	if (ioc->weights_updated) {
1152 		/* paired with rmb in current_hweight(), see there */
1153 		smp_wmb();
1154 		atomic_inc(&ioc->hweight_gen);
1155 		ioc->weights_updated = false;
1156 	}
1157 }
1158 
1159 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1160 			      bool save, struct ioc_now *now)
1161 {
1162 	__propagate_weights(iocg, active, inuse, save, now);
1163 	commit_weights(iocg->ioc);
1164 }
1165 
1166 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1167 {
1168 	struct ioc *ioc = iocg->ioc;
1169 	int lvl;
1170 	u32 hwa, hwi;
1171 	int ioc_gen;
1172 
1173 	/* hot path - if uptodate, use cached */
1174 	ioc_gen = atomic_read(&ioc->hweight_gen);
1175 	if (ioc_gen == iocg->hweight_gen)
1176 		goto out;
1177 
1178 	/*
1179 	 * Paired with wmb in commit_weights(). If we saw the updated
1180 	 * hweight_gen, all the weight updates from __propagate_weights() are
1181 	 * visible too.
1182 	 *
1183 	 * We can race with weight updates during calculation and get it
1184 	 * wrong.  However, hweight_gen would have changed and a future
1185 	 * reader will recalculate and we're guaranteed to discard the
1186 	 * wrong result soon.
1187 	 */
1188 	smp_rmb();
1189 
1190 	hwa = hwi = WEIGHT_ONE;
1191 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1192 		struct ioc_gq *parent = iocg->ancestors[lvl];
1193 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1194 		u64 active_sum = READ_ONCE(parent->child_active_sum);
1195 		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1196 		u32 active = READ_ONCE(child->active);
1197 		u32 inuse = READ_ONCE(child->inuse);
1198 
1199 		/* we can race with deactivations and either may read as zero */
1200 		if (!active_sum || !inuse_sum)
1201 			continue;
1202 
1203 		active_sum = max_t(u64, active, active_sum);
1204 		hwa = div64_u64((u64)hwa * active, active_sum);
1205 
1206 		inuse_sum = max_t(u64, inuse, inuse_sum);
1207 		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1208 	}
1209 
1210 	iocg->hweight_active = max_t(u32, hwa, 1);
1211 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1212 	iocg->hweight_gen = ioc_gen;
1213 out:
1214 	if (hw_activep)
1215 		*hw_activep = iocg->hweight_active;
1216 	if (hw_inusep)
1217 		*hw_inusep = iocg->hweight_inuse;
1218 }
1219 
1220 /*
1221  * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1222  * other weights stay unchanged.
1223  */
1224 static u32 current_hweight_max(struct ioc_gq *iocg)
1225 {
1226 	u32 hwm = WEIGHT_ONE;
1227 	u32 inuse = iocg->active;
1228 	u64 child_inuse_sum;
1229 	int lvl;
1230 
1231 	lockdep_assert_held(&iocg->ioc->lock);
1232 
1233 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1234 		struct ioc_gq *parent = iocg->ancestors[lvl];
1235 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1236 
1237 		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1238 		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1239 		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1240 					   parent->child_active_sum);
1241 	}
1242 
1243 	return max_t(u32, hwm, 1);
1244 }
1245 
1246 static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1247 {
1248 	struct ioc *ioc = iocg->ioc;
1249 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1250 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1251 	u32 weight;
1252 
1253 	lockdep_assert_held(&ioc->lock);
1254 
1255 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1256 	if (weight != iocg->weight && iocg->active)
1257 		propagate_weights(iocg, weight, iocg->inuse, true, now);
1258 	iocg->weight = weight;
1259 }
1260 
1261 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1262 {
1263 	struct ioc *ioc = iocg->ioc;
1264 	u64 __maybe_unused last_period, cur_period;
1265 	u64 vtime, vtarget;
1266 	int i;
1267 
1268 	/*
1269 	 * If seem to be already active, just update the stamp to tell the
1270 	 * timer that we're still active.  We don't mind occassional races.
1271 	 */
1272 	if (!list_empty(&iocg->active_list)) {
1273 		ioc_now(ioc, now);
1274 		cur_period = atomic64_read(&ioc->cur_period);
1275 		if (atomic64_read(&iocg->active_period) != cur_period)
1276 			atomic64_set(&iocg->active_period, cur_period);
1277 		return true;
1278 	}
1279 
1280 	/* racy check on internal node IOs, treat as root level IOs */
1281 	if (iocg->child_active_sum)
1282 		return false;
1283 
1284 	spin_lock_irq(&ioc->lock);
1285 
1286 	ioc_now(ioc, now);
1287 
1288 	/* update period */
1289 	cur_period = atomic64_read(&ioc->cur_period);
1290 	last_period = atomic64_read(&iocg->active_period);
1291 	atomic64_set(&iocg->active_period, cur_period);
1292 
1293 	/* already activated or breaking leaf-only constraint? */
1294 	if (!list_empty(&iocg->active_list))
1295 		goto succeed_unlock;
1296 	for (i = iocg->level - 1; i > 0; i--)
1297 		if (!list_empty(&iocg->ancestors[i]->active_list))
1298 			goto fail_unlock;
1299 
1300 	if (iocg->child_active_sum)
1301 		goto fail_unlock;
1302 
1303 	/*
1304 	 * Always start with the target budget. On deactivation, we throw away
1305 	 * anything above it.
1306 	 */
1307 	vtarget = now->vnow - ioc->margins.target;
1308 	vtime = atomic64_read(&iocg->vtime);
1309 
1310 	atomic64_add(vtarget - vtime, &iocg->vtime);
1311 	atomic64_add(vtarget - vtime, &iocg->done_vtime);
1312 	vtime = vtarget;
1313 
1314 	/*
1315 	 * Activate, propagate weight and start period timer if not
1316 	 * running.  Reset hweight_gen to avoid accidental match from
1317 	 * wrapping.
1318 	 */
1319 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1320 	list_add(&iocg->active_list, &ioc->active_iocgs);
1321 
1322 	propagate_weights(iocg, iocg->weight,
1323 			  iocg->last_inuse ?: iocg->weight, true, now);
1324 
1325 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1326 			last_period, cur_period, vtime);
1327 
1328 	iocg->activated_at = now->now;
1329 
1330 	if (ioc->running == IOC_IDLE) {
1331 		ioc->running = IOC_RUNNING;
1332 		ioc->dfgv_period_at = now->now;
1333 		ioc->dfgv_period_rem = 0;
1334 		ioc_start_period(ioc, now);
1335 	}
1336 
1337 succeed_unlock:
1338 	spin_unlock_irq(&ioc->lock);
1339 	return true;
1340 
1341 fail_unlock:
1342 	spin_unlock_irq(&ioc->lock);
1343 	return false;
1344 }
1345 
1346 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1347 {
1348 	struct ioc *ioc = iocg->ioc;
1349 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1350 	u64 tdelta, delay, new_delay, shift;
1351 	s64 vover, vover_pct;
1352 	u32 hwa;
1353 
1354 	lockdep_assert_held(&iocg->waitq.lock);
1355 
1356 	/*
1357 	 * If the delay is set by another CPU, we may be in the past. No need to
1358 	 * change anything if so. This avoids decay calculation underflow.
1359 	 */
1360 	if (time_before64(now->now, iocg->delay_at))
1361 		return false;
1362 
1363 	/* calculate the current delay in effect - 1/2 every second */
1364 	tdelta = now->now - iocg->delay_at;
1365 	shift = div64_u64(tdelta, USEC_PER_SEC);
1366 	if (iocg->delay && shift < BITS_PER_LONG)
1367 		delay = iocg->delay >> shift;
1368 	else
1369 		delay = 0;
1370 
1371 	/* calculate the new delay from the debt amount */
1372 	current_hweight(iocg, &hwa, NULL);
1373 	vover = atomic64_read(&iocg->vtime) +
1374 		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1375 	vover_pct = div64_s64(100 * vover,
1376 			      ioc->period_us * ioc->vtime_base_rate);
1377 
1378 	if (vover_pct <= MIN_DELAY_THR_PCT)
1379 		new_delay = 0;
1380 	else if (vover_pct >= MAX_DELAY_THR_PCT)
1381 		new_delay = MAX_DELAY;
1382 	else
1383 		new_delay = MIN_DELAY +
1384 			div_u64((MAX_DELAY - MIN_DELAY) *
1385 				(vover_pct - MIN_DELAY_THR_PCT),
1386 				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1387 
1388 	/* pick the higher one and apply */
1389 	if (new_delay > delay) {
1390 		iocg->delay = new_delay;
1391 		iocg->delay_at = now->now;
1392 		delay = new_delay;
1393 	}
1394 
1395 	if (delay >= MIN_DELAY) {
1396 		if (!iocg->indelay_since)
1397 			iocg->indelay_since = now->now;
1398 		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1399 		return true;
1400 	} else {
1401 		if (iocg->indelay_since) {
1402 			iocg->stat.indelay_us += now->now - iocg->indelay_since;
1403 			iocg->indelay_since = 0;
1404 		}
1405 		iocg->delay = 0;
1406 		blkcg_clear_delay(blkg);
1407 		return false;
1408 	}
1409 }
1410 
1411 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1412 			    struct ioc_now *now)
1413 {
1414 	struct iocg_pcpu_stat *gcs;
1415 
1416 	lockdep_assert_held(&iocg->ioc->lock);
1417 	lockdep_assert_held(&iocg->waitq.lock);
1418 	WARN_ON_ONCE(list_empty(&iocg->active_list));
1419 
1420 	/*
1421 	 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1422 	 * inuse donating all of it share to others until its debt is paid off.
1423 	 */
1424 	if (!iocg->abs_vdebt && abs_cost) {
1425 		iocg->indebt_since = now->now;
1426 		propagate_weights(iocg, iocg->active, 0, false, now);
1427 	}
1428 
1429 	iocg->abs_vdebt += abs_cost;
1430 
1431 	gcs = get_cpu_ptr(iocg->pcpu_stat);
1432 	local64_add(abs_cost, &gcs->abs_vusage);
1433 	put_cpu_ptr(gcs);
1434 }
1435 
1436 static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1437 			  struct ioc_now *now)
1438 {
1439 	lockdep_assert_held(&iocg->ioc->lock);
1440 	lockdep_assert_held(&iocg->waitq.lock);
1441 
1442 	/* make sure that nobody messed with @iocg */
1443 	WARN_ON_ONCE(list_empty(&iocg->active_list));
1444 	WARN_ON_ONCE(iocg->inuse > 1);
1445 
1446 	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1447 
1448 	/* if debt is paid in full, restore inuse */
1449 	if (!iocg->abs_vdebt) {
1450 		iocg->stat.indebt_us += now->now - iocg->indebt_since;
1451 		iocg->indebt_since = 0;
1452 
1453 		propagate_weights(iocg, iocg->active, iocg->last_inuse,
1454 				  false, now);
1455 	}
1456 }
1457 
1458 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1459 			int flags, void *key)
1460 {
1461 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1462 	struct iocg_wake_ctx *ctx = key;
1463 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1464 
1465 	ctx->vbudget -= cost;
1466 
1467 	if (ctx->vbudget < 0)
1468 		return -1;
1469 
1470 	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1471 	wait->committed = true;
1472 
1473 	/*
1474 	 * autoremove_wake_function() removes the wait entry only when it
1475 	 * actually changed the task state. We want the wait always removed.
1476 	 * Remove explicitly and use default_wake_function(). Note that the
1477 	 * order of operations is important as finish_wait() tests whether
1478 	 * @wq_entry is removed without grabbing the lock.
1479 	 */
1480 	default_wake_function(wq_entry, mode, flags, key);
1481 	list_del_init_careful(&wq_entry->entry);
1482 	return 0;
1483 }
1484 
1485 /*
1486  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1487  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1488  * addition to iocg->waitq.lock.
1489  */
1490 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1491 			    struct ioc_now *now)
1492 {
1493 	struct ioc *ioc = iocg->ioc;
1494 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1495 	u64 vshortage, expires, oexpires;
1496 	s64 vbudget;
1497 	u32 hwa;
1498 
1499 	lockdep_assert_held(&iocg->waitq.lock);
1500 
1501 	current_hweight(iocg, &hwa, NULL);
1502 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1503 
1504 	/* pay off debt */
1505 	if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1506 		u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1507 		u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1508 		u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1509 
1510 		lockdep_assert_held(&ioc->lock);
1511 
1512 		atomic64_add(vpay, &iocg->vtime);
1513 		atomic64_add(vpay, &iocg->done_vtime);
1514 		iocg_pay_debt(iocg, abs_vpay, now);
1515 		vbudget -= vpay;
1516 	}
1517 
1518 	if (iocg->abs_vdebt || iocg->delay)
1519 		iocg_kick_delay(iocg, now);
1520 
1521 	/*
1522 	 * Debt can still be outstanding if we haven't paid all yet or the
1523 	 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1524 	 * under debt. Make sure @vbudget reflects the outstanding amount and is
1525 	 * not positive.
1526 	 */
1527 	if (iocg->abs_vdebt) {
1528 		s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1529 		vbudget = min_t(s64, 0, vbudget - vdebt);
1530 	}
1531 
1532 	/*
1533 	 * Wake up the ones which are due and see how much vtime we'll need for
1534 	 * the next one. As paying off debt restores hw_inuse, it must be read
1535 	 * after the above debt payment.
1536 	 */
1537 	ctx.vbudget = vbudget;
1538 	current_hweight(iocg, NULL, &ctx.hw_inuse);
1539 
1540 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1541 
1542 	if (!waitqueue_active(&iocg->waitq)) {
1543 		if (iocg->wait_since) {
1544 			iocg->stat.wait_us += now->now - iocg->wait_since;
1545 			iocg->wait_since = 0;
1546 		}
1547 		return;
1548 	}
1549 
1550 	if (!iocg->wait_since)
1551 		iocg->wait_since = now->now;
1552 
1553 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1554 		return;
1555 
1556 	/* determine next wakeup, add a timer margin to guarantee chunking */
1557 	vshortage = -ctx.vbudget;
1558 	expires = now->now_ns +
1559 		DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1560 		NSEC_PER_USEC;
1561 	expires += ioc->timer_slack_ns;
1562 
1563 	/* if already active and close enough, don't bother */
1564 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1565 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1566 	    abs(oexpires - expires) <= ioc->timer_slack_ns)
1567 		return;
1568 
1569 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1570 			       ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1571 }
1572 
1573 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1574 {
1575 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1576 	bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1577 	struct ioc_now now;
1578 	unsigned long flags;
1579 
1580 	ioc_now(iocg->ioc, &now);
1581 
1582 	iocg_lock(iocg, pay_debt, &flags);
1583 	iocg_kick_waitq(iocg, pay_debt, &now);
1584 	iocg_unlock(iocg, pay_debt, &flags);
1585 
1586 	return HRTIMER_NORESTART;
1587 }
1588 
1589 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1590 {
1591 	u32 nr_met[2] = { };
1592 	u32 nr_missed[2] = { };
1593 	u64 rq_wait_ns = 0;
1594 	int cpu, rw;
1595 
1596 	for_each_online_cpu(cpu) {
1597 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1598 		u64 this_rq_wait_ns;
1599 
1600 		for (rw = READ; rw <= WRITE; rw++) {
1601 			u32 this_met = local_read(&stat->missed[rw].nr_met);
1602 			u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1603 
1604 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1605 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1606 			stat->missed[rw].last_met = this_met;
1607 			stat->missed[rw].last_missed = this_missed;
1608 		}
1609 
1610 		this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1611 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1612 		stat->last_rq_wait_ns = this_rq_wait_ns;
1613 	}
1614 
1615 	for (rw = READ; rw <= WRITE; rw++) {
1616 		if (nr_met[rw] + nr_missed[rw])
1617 			missed_ppm_ar[rw] =
1618 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1619 						   nr_met[rw] + nr_missed[rw]);
1620 		else
1621 			missed_ppm_ar[rw] = 0;
1622 	}
1623 
1624 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1625 				   ioc->period_us * NSEC_PER_USEC);
1626 }
1627 
1628 /* was iocg idle this period? */
1629 static bool iocg_is_idle(struct ioc_gq *iocg)
1630 {
1631 	struct ioc *ioc = iocg->ioc;
1632 
1633 	/* did something get issued this period? */
1634 	if (atomic64_read(&iocg->active_period) ==
1635 	    atomic64_read(&ioc->cur_period))
1636 		return false;
1637 
1638 	/* is something in flight? */
1639 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1640 		return false;
1641 
1642 	return true;
1643 }
1644 
1645 /*
1646  * Call this function on the target leaf @iocg's to build pre-order traversal
1647  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1648  * ->walk_list and the caller is responsible for dissolving the list after use.
1649  */
1650 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1651 				  struct list_head *inner_walk)
1652 {
1653 	int lvl;
1654 
1655 	WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1656 
1657 	/* find the first ancestor which hasn't been visited yet */
1658 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1659 		if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1660 			break;
1661 	}
1662 
1663 	/* walk down and visit the inner nodes to get pre-order traversal */
1664 	while (++lvl <= iocg->level - 1) {
1665 		struct ioc_gq *inner = iocg->ancestors[lvl];
1666 
1667 		/* record traversal order */
1668 		list_add_tail(&inner->walk_list, inner_walk);
1669 	}
1670 }
1671 
1672 /* propagate the deltas to the parent */
1673 static void iocg_flush_stat_upward(struct ioc_gq *iocg)
1674 {
1675 	if (iocg->level > 0) {
1676 		struct iocg_stat *parent_stat =
1677 			&iocg->ancestors[iocg->level - 1]->stat;
1678 
1679 		parent_stat->usage_us +=
1680 			iocg->stat.usage_us - iocg->last_stat.usage_us;
1681 		parent_stat->wait_us +=
1682 			iocg->stat.wait_us - iocg->last_stat.wait_us;
1683 		parent_stat->indebt_us +=
1684 			iocg->stat.indebt_us - iocg->last_stat.indebt_us;
1685 		parent_stat->indelay_us +=
1686 			iocg->stat.indelay_us - iocg->last_stat.indelay_us;
1687 	}
1688 
1689 	iocg->last_stat = iocg->stat;
1690 }
1691 
1692 /* collect per-cpu counters and propagate the deltas to the parent */
1693 static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
1694 {
1695 	struct ioc *ioc = iocg->ioc;
1696 	u64 abs_vusage = 0;
1697 	u64 vusage_delta;
1698 	int cpu;
1699 
1700 	lockdep_assert_held(&iocg->ioc->lock);
1701 
1702 	/* collect per-cpu counters */
1703 	for_each_possible_cpu(cpu) {
1704 		abs_vusage += local64_read(
1705 				per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1706 	}
1707 	vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1708 	iocg->last_stat_abs_vusage = abs_vusage;
1709 
1710 	iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1711 	iocg->stat.usage_us += iocg->usage_delta_us;
1712 
1713 	iocg_flush_stat_upward(iocg);
1714 }
1715 
1716 /* get stat counters ready for reading on all active iocgs */
1717 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1718 {
1719 	LIST_HEAD(inner_walk);
1720 	struct ioc_gq *iocg, *tiocg;
1721 
1722 	/* flush leaves and build inner node walk list */
1723 	list_for_each_entry(iocg, target_iocgs, active_list) {
1724 		iocg_flush_stat_leaf(iocg, now);
1725 		iocg_build_inner_walk(iocg, &inner_walk);
1726 	}
1727 
1728 	/* keep flushing upwards by walking the inner list backwards */
1729 	list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1730 		iocg_flush_stat_upward(iocg);
1731 		list_del_init(&iocg->walk_list);
1732 	}
1733 }
1734 
1735 /*
1736  * Determine what @iocg's hweight_inuse should be after donating unused
1737  * capacity. @hwm is the upper bound and used to signal no donation. This
1738  * function also throws away @iocg's excess budget.
1739  */
1740 static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1741 				  u32 usage, struct ioc_now *now)
1742 {
1743 	struct ioc *ioc = iocg->ioc;
1744 	u64 vtime = atomic64_read(&iocg->vtime);
1745 	s64 excess, delta, target, new_hwi;
1746 
1747 	/* debt handling owns inuse for debtors */
1748 	if (iocg->abs_vdebt)
1749 		return 1;
1750 
1751 	/* see whether minimum margin requirement is met */
1752 	if (waitqueue_active(&iocg->waitq) ||
1753 	    time_after64(vtime, now->vnow - ioc->margins.min))
1754 		return hwm;
1755 
1756 	/* throw away excess above target */
1757 	excess = now->vnow - vtime - ioc->margins.target;
1758 	if (excess > 0) {
1759 		atomic64_add(excess, &iocg->vtime);
1760 		atomic64_add(excess, &iocg->done_vtime);
1761 		vtime += excess;
1762 		ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1763 	}
1764 
1765 	/*
1766 	 * Let's say the distance between iocg's and device's vtimes as a
1767 	 * fraction of period duration is delta. Assuming that the iocg will
1768 	 * consume the usage determined above, we want to determine new_hwi so
1769 	 * that delta equals MARGIN_TARGET at the end of the next period.
1770 	 *
1771 	 * We need to execute usage worth of IOs while spending the sum of the
1772 	 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1773 	 * (delta):
1774 	 *
1775 	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1776 	 *
1777 	 * Therefore, the new_hwi is:
1778 	 *
1779 	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1780 	 */
1781 	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1782 			  now->vnow - ioc->period_at_vtime);
1783 	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1784 	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1785 
1786 	return clamp_t(s64, new_hwi, 1, hwm);
1787 }
1788 
1789 /*
1790  * For work-conservation, an iocg which isn't using all of its share should
1791  * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1792  * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1793  *
1794  * #1 is mathematically simpler but has the drawback of requiring synchronous
1795  * global hweight_inuse updates when idle iocg's get activated or inuse weights
1796  * change due to donation snapbacks as it has the possibility of grossly
1797  * overshooting what's allowed by the model and vrate.
1798  *
1799  * #2 is inherently safe with local operations. The donating iocg can easily
1800  * snap back to higher weights when needed without worrying about impacts on
1801  * other nodes as the impacts will be inherently correct. This also makes idle
1802  * iocg activations safe. The only effect activations have is decreasing
1803  * hweight_inuse of others, the right solution to which is for those iocgs to
1804  * snap back to higher weights.
1805  *
1806  * So, we go with #2. The challenge is calculating how each donating iocg's
1807  * inuse should be adjusted to achieve the target donation amounts. This is done
1808  * using Andy's method described in the following pdf.
1809  *
1810  *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1811  *
1812  * Given the weights and target after-donation hweight_inuse values, Andy's
1813  * method determines how the proportional distribution should look like at each
1814  * sibling level to maintain the relative relationship between all non-donating
1815  * pairs. To roughly summarize, it divides the tree into donating and
1816  * non-donating parts, calculates global donation rate which is used to
1817  * determine the target hweight_inuse for each node, and then derives per-level
1818  * proportions.
1819  *
1820  * The following pdf shows that global distribution calculated this way can be
1821  * achieved by scaling inuse weights of donating leaves and propagating the
1822  * adjustments upwards proportionally.
1823  *
1824  *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1825  *
1826  * Combining the above two, we can determine how each leaf iocg's inuse should
1827  * be adjusted to achieve the target donation.
1828  *
1829  *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1830  *
1831  * The inline comments use symbols from the last pdf.
1832  *
1833  *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1834  *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1835  *   t is the sum of the absolute budgets of donating nodes in the subtree.
1836  *   w is the weight of the node. w = w_f + w_t
1837  *   w_f is the non-donating portion of w. w_f = w * f / b
1838  *   w_t is the donating portion of w. w_t = w * t / b
1839  *   s is the sum of all sibling weights. s = Sum(w) for siblings
1840  *   s_f and s_t are the non-donating and donating portions of s.
1841  *
1842  * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1843  * w_pt is the donating portion of the parent's weight and w'_pt the same value
1844  * after adjustments. Subscript r denotes the root node's values.
1845  */
1846 static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1847 {
1848 	LIST_HEAD(over_hwa);
1849 	LIST_HEAD(inner_walk);
1850 	struct ioc_gq *iocg, *tiocg, *root_iocg;
1851 	u32 after_sum, over_sum, over_target, gamma;
1852 
1853 	/*
1854 	 * It's pretty unlikely but possible for the total sum of
1855 	 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1856 	 * confuse the following calculations. If such condition is detected,
1857 	 * scale down everyone over its full share equally to keep the sum below
1858 	 * WEIGHT_ONE.
1859 	 */
1860 	after_sum = 0;
1861 	over_sum = 0;
1862 	list_for_each_entry(iocg, surpluses, surplus_list) {
1863 		u32 hwa;
1864 
1865 		current_hweight(iocg, &hwa, NULL);
1866 		after_sum += iocg->hweight_after_donation;
1867 
1868 		if (iocg->hweight_after_donation > hwa) {
1869 			over_sum += iocg->hweight_after_donation;
1870 			list_add(&iocg->walk_list, &over_hwa);
1871 		}
1872 	}
1873 
1874 	if (after_sum >= WEIGHT_ONE) {
1875 		/*
1876 		 * The delta should be deducted from the over_sum, calculate
1877 		 * target over_sum value.
1878 		 */
1879 		u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1880 		WARN_ON_ONCE(over_sum <= over_delta);
1881 		over_target = over_sum - over_delta;
1882 	} else {
1883 		over_target = 0;
1884 	}
1885 
1886 	list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1887 		if (over_target)
1888 			iocg->hweight_after_donation =
1889 				div_u64((u64)iocg->hweight_after_donation *
1890 					over_target, over_sum);
1891 		list_del_init(&iocg->walk_list);
1892 	}
1893 
1894 	/*
1895 	 * Build pre-order inner node walk list and prepare for donation
1896 	 * adjustment calculations.
1897 	 */
1898 	list_for_each_entry(iocg, surpluses, surplus_list) {
1899 		iocg_build_inner_walk(iocg, &inner_walk);
1900 	}
1901 
1902 	root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1903 	WARN_ON_ONCE(root_iocg->level > 0);
1904 
1905 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1906 		iocg->child_adjusted_sum = 0;
1907 		iocg->hweight_donating = 0;
1908 		iocg->hweight_after_donation = 0;
1909 	}
1910 
1911 	/*
1912 	 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1913 	 * up the hierarchy.
1914 	 */
1915 	list_for_each_entry(iocg, surpluses, surplus_list) {
1916 		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1917 
1918 		parent->hweight_donating += iocg->hweight_donating;
1919 		parent->hweight_after_donation += iocg->hweight_after_donation;
1920 	}
1921 
1922 	list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1923 		if (iocg->level > 0) {
1924 			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1925 
1926 			parent->hweight_donating += iocg->hweight_donating;
1927 			parent->hweight_after_donation += iocg->hweight_after_donation;
1928 		}
1929 	}
1930 
1931 	/*
1932 	 * Calculate inner hwa's (b) and make sure the donation values are
1933 	 * within the accepted ranges as we're doing low res calculations with
1934 	 * roundups.
1935 	 */
1936 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1937 		if (iocg->level) {
1938 			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1939 
1940 			iocg->hweight_active = DIV64_U64_ROUND_UP(
1941 				(u64)parent->hweight_active * iocg->active,
1942 				parent->child_active_sum);
1943 
1944 		}
1945 
1946 		iocg->hweight_donating = min(iocg->hweight_donating,
1947 					     iocg->hweight_active);
1948 		iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1949 						   iocg->hweight_donating - 1);
1950 		if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1951 				 iocg->hweight_donating <= 1 ||
1952 				 iocg->hweight_after_donation == 0)) {
1953 			pr_warn("iocg: invalid donation weights in ");
1954 			pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1955 			pr_cont(": active=%u donating=%u after=%u\n",
1956 				iocg->hweight_active, iocg->hweight_donating,
1957 				iocg->hweight_after_donation);
1958 		}
1959 	}
1960 
1961 	/*
1962 	 * Calculate the global donation rate (gamma) - the rate to adjust
1963 	 * non-donating budgets by.
1964 	 *
1965 	 * No need to use 64bit multiplication here as the first operand is
1966 	 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1967 	 *
1968 	 * We know that there are beneficiary nodes and the sum of the donating
1969 	 * hweights can't be whole; however, due to the round-ups during hweight
1970 	 * calculations, root_iocg->hweight_donating might still end up equal to
1971 	 * or greater than whole. Limit the range when calculating the divider.
1972 	 *
1973 	 * gamma = (1 - t_r') / (1 - t_r)
1974 	 */
1975 	gamma = DIV_ROUND_UP(
1976 		(WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1977 		WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1978 
1979 	/*
1980 	 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1981 	 * nodes.
1982 	 */
1983 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1984 		struct ioc_gq *parent;
1985 		u32 inuse, wpt, wptp;
1986 		u64 st, sf;
1987 
1988 		if (iocg->level == 0) {
1989 			/* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1990 			iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1991 				iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1992 				WEIGHT_ONE - iocg->hweight_after_donation);
1993 			continue;
1994 		}
1995 
1996 		parent = iocg->ancestors[iocg->level - 1];
1997 
1998 		/* b' = gamma * b_f + b_t' */
1999 		iocg->hweight_inuse = DIV64_U64_ROUND_UP(
2000 			(u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
2001 			WEIGHT_ONE) + iocg->hweight_after_donation;
2002 
2003 		/* w' = s' * b' / b'_p */
2004 		inuse = DIV64_U64_ROUND_UP(
2005 			(u64)parent->child_adjusted_sum * iocg->hweight_inuse,
2006 			parent->hweight_inuse);
2007 
2008 		/* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
2009 		st = DIV64_U64_ROUND_UP(
2010 			iocg->child_active_sum * iocg->hweight_donating,
2011 			iocg->hweight_active);
2012 		sf = iocg->child_active_sum - st;
2013 		wpt = DIV64_U64_ROUND_UP(
2014 			(u64)iocg->active * iocg->hweight_donating,
2015 			iocg->hweight_active);
2016 		wptp = DIV64_U64_ROUND_UP(
2017 			(u64)inuse * iocg->hweight_after_donation,
2018 			iocg->hweight_inuse);
2019 
2020 		iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
2021 	}
2022 
2023 	/*
2024 	 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2025 	 * we can finally determine leaf adjustments.
2026 	 */
2027 	list_for_each_entry(iocg, surpluses, surplus_list) {
2028 		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2029 		u32 inuse;
2030 
2031 		/*
2032 		 * In-debt iocgs participated in the donation calculation with
2033 		 * the minimum target hweight_inuse. Configuring inuse
2034 		 * accordingly would work fine but debt handling expects
2035 		 * @iocg->inuse stay at the minimum and we don't wanna
2036 		 * interfere.
2037 		 */
2038 		if (iocg->abs_vdebt) {
2039 			WARN_ON_ONCE(iocg->inuse > 1);
2040 			continue;
2041 		}
2042 
2043 		/* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2044 		inuse = DIV64_U64_ROUND_UP(
2045 			parent->child_adjusted_sum * iocg->hweight_after_donation,
2046 			parent->hweight_inuse);
2047 
2048 		TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2049 				iocg->inuse, inuse,
2050 				iocg->hweight_inuse,
2051 				iocg->hweight_after_donation);
2052 
2053 		__propagate_weights(iocg, iocg->active, inuse, true, now);
2054 	}
2055 
2056 	/* walk list should be dissolved after use */
2057 	list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2058 		list_del_init(&iocg->walk_list);
2059 }
2060 
2061 /*
2062  * A low weight iocg can amass a large amount of debt, for example, when
2063  * anonymous memory gets reclaimed aggressively. If the system has a lot of
2064  * memory paired with a slow IO device, the debt can span multiple seconds or
2065  * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2066  * up blocked paying its debt while the IO device is idle.
2067  *
2068  * The following protects against such cases. If the device has been
2069  * sufficiently idle for a while, the debts are halved and delays are
2070  * recalculated.
2071  */
2072 static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2073 			      struct ioc_now *now)
2074 {
2075 	struct ioc_gq *iocg;
2076 	u64 dur, usage_pct, nr_cycles;
2077 
2078 	/* if no debtor, reset the cycle */
2079 	if (!nr_debtors) {
2080 		ioc->dfgv_period_at = now->now;
2081 		ioc->dfgv_period_rem = 0;
2082 		ioc->dfgv_usage_us_sum = 0;
2083 		return;
2084 	}
2085 
2086 	/*
2087 	 * Debtors can pass through a lot of writes choking the device and we
2088 	 * don't want to be forgiving debts while the device is struggling from
2089 	 * write bursts. If we're missing latency targets, consider the device
2090 	 * fully utilized.
2091 	 */
2092 	if (ioc->busy_level > 0)
2093 		usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2094 
2095 	ioc->dfgv_usage_us_sum += usage_us_sum;
2096 	if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2097 		return;
2098 
2099 	/*
2100 	 * At least DFGV_PERIOD has passed since the last period. Calculate the
2101 	 * average usage and reset the period counters.
2102 	 */
2103 	dur = now->now - ioc->dfgv_period_at;
2104 	usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2105 
2106 	ioc->dfgv_period_at = now->now;
2107 	ioc->dfgv_usage_us_sum = 0;
2108 
2109 	/* if was too busy, reset everything */
2110 	if (usage_pct > DFGV_USAGE_PCT) {
2111 		ioc->dfgv_period_rem = 0;
2112 		return;
2113 	}
2114 
2115 	/*
2116 	 * Usage is lower than threshold. Let's forgive some debts. Debt
2117 	 * forgiveness runs off of the usual ioc timer but its period usually
2118 	 * doesn't match ioc's. Compensate the difference by performing the
2119 	 * reduction as many times as would fit in the duration since the last
2120 	 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2121 	 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2122 	 * reductions is doubled.
2123 	 */
2124 	nr_cycles = dur + ioc->dfgv_period_rem;
2125 	ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2126 
2127 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2128 		u64 __maybe_unused old_debt, __maybe_unused old_delay;
2129 
2130 		if (!iocg->abs_vdebt && !iocg->delay)
2131 			continue;
2132 
2133 		spin_lock(&iocg->waitq.lock);
2134 
2135 		old_debt = iocg->abs_vdebt;
2136 		old_delay = iocg->delay;
2137 
2138 		if (iocg->abs_vdebt)
2139 			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2140 		if (iocg->delay)
2141 			iocg->delay = iocg->delay >> nr_cycles ?: 1;
2142 
2143 		iocg_kick_waitq(iocg, true, now);
2144 
2145 		TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2146 				old_debt, iocg->abs_vdebt,
2147 				old_delay, iocg->delay);
2148 
2149 		spin_unlock(&iocg->waitq.lock);
2150 	}
2151 }
2152 
2153 /*
2154  * Check the active iocgs' state to avoid oversleeping and deactive
2155  * idle iocgs.
2156  *
2157  * Since waiters determine the sleep durations based on the vrate
2158  * they saw at the time of sleep, if vrate has increased, some
2159  * waiters could be sleeping for too long. Wake up tardy waiters
2160  * which should have woken up in the last period and expire idle
2161  * iocgs.
2162  */
2163 static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2164 {
2165 	int nr_debtors = 0;
2166 	struct ioc_gq *iocg, *tiocg;
2167 
2168 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2169 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2170 		    !iocg->delay && !iocg_is_idle(iocg))
2171 			continue;
2172 
2173 		spin_lock(&iocg->waitq.lock);
2174 
2175 		/* flush wait and indebt stat deltas */
2176 		if (iocg->wait_since) {
2177 			iocg->stat.wait_us += now->now - iocg->wait_since;
2178 			iocg->wait_since = now->now;
2179 		}
2180 		if (iocg->indebt_since) {
2181 			iocg->stat.indebt_us +=
2182 				now->now - iocg->indebt_since;
2183 			iocg->indebt_since = now->now;
2184 		}
2185 		if (iocg->indelay_since) {
2186 			iocg->stat.indelay_us +=
2187 				now->now - iocg->indelay_since;
2188 			iocg->indelay_since = now->now;
2189 		}
2190 
2191 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2192 		    iocg->delay) {
2193 			/* might be oversleeping vtime / hweight changes, kick */
2194 			iocg_kick_waitq(iocg, true, now);
2195 			if (iocg->abs_vdebt || iocg->delay)
2196 				nr_debtors++;
2197 		} else if (iocg_is_idle(iocg)) {
2198 			/* no waiter and idle, deactivate */
2199 			u64 vtime = atomic64_read(&iocg->vtime);
2200 			s64 excess;
2201 
2202 			/*
2203 			 * @iocg has been inactive for a full duration and will
2204 			 * have a high budget. Account anything above target as
2205 			 * error and throw away. On reactivation, it'll start
2206 			 * with the target budget.
2207 			 */
2208 			excess = now->vnow - vtime - ioc->margins.target;
2209 			if (excess > 0) {
2210 				u32 old_hwi;
2211 
2212 				current_hweight(iocg, NULL, &old_hwi);
2213 				ioc->vtime_err -= div64_u64(excess * old_hwi,
2214 							    WEIGHT_ONE);
2215 			}
2216 
2217 			TRACE_IOCG_PATH(iocg_idle, iocg, now,
2218 					atomic64_read(&iocg->active_period),
2219 					atomic64_read(&ioc->cur_period), vtime);
2220 			__propagate_weights(iocg, 0, 0, false, now);
2221 			list_del_init(&iocg->active_list);
2222 		}
2223 
2224 		spin_unlock(&iocg->waitq.lock);
2225 	}
2226 
2227 	commit_weights(ioc);
2228 	return nr_debtors;
2229 }
2230 
2231 static void ioc_timer_fn(struct timer_list *timer)
2232 {
2233 	struct ioc *ioc = container_of(timer, struct ioc, timer);
2234 	struct ioc_gq *iocg, *tiocg;
2235 	struct ioc_now now;
2236 	LIST_HEAD(surpluses);
2237 	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2238 	u64 usage_us_sum = 0;
2239 	u32 ppm_rthr;
2240 	u32 ppm_wthr;
2241 	u32 missed_ppm[2], rq_wait_pct;
2242 	u64 period_vtime;
2243 	int prev_busy_level;
2244 
2245 	/* how were the latencies during the period? */
2246 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2247 
2248 	/* take care of active iocgs */
2249 	spin_lock_irq(&ioc->lock);
2250 
2251 	ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2252 	ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2253 	ioc_now(ioc, &now);
2254 
2255 	period_vtime = now.vnow - ioc->period_at_vtime;
2256 	if (WARN_ON_ONCE(!period_vtime)) {
2257 		spin_unlock_irq(&ioc->lock);
2258 		return;
2259 	}
2260 
2261 	nr_debtors = ioc_check_iocgs(ioc, &now);
2262 
2263 	/*
2264 	 * Wait and indebt stat are flushed above and the donation calculation
2265 	 * below needs updated usage stat. Let's bring stat up-to-date.
2266 	 */
2267 	iocg_flush_stat(&ioc->active_iocgs, &now);
2268 
2269 	/* calc usage and see whether some weights need to be moved around */
2270 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2271 		u64 vdone, vtime, usage_us;
2272 		u32 hw_active, hw_inuse;
2273 
2274 		/*
2275 		 * Collect unused and wind vtime closer to vnow to prevent
2276 		 * iocgs from accumulating a large amount of budget.
2277 		 */
2278 		vdone = atomic64_read(&iocg->done_vtime);
2279 		vtime = atomic64_read(&iocg->vtime);
2280 		current_hweight(iocg, &hw_active, &hw_inuse);
2281 
2282 		/*
2283 		 * Latency QoS detection doesn't account for IOs which are
2284 		 * in-flight for longer than a period.  Detect them by
2285 		 * comparing vdone against period start.  If lagging behind
2286 		 * IOs from past periods, don't increase vrate.
2287 		 */
2288 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2289 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2290 		    time_after64(vtime, vdone) &&
2291 		    time_after64(vtime, now.vnow -
2292 				 MAX_LAGGING_PERIODS * period_vtime) &&
2293 		    time_before64(vdone, now.vnow - period_vtime))
2294 			nr_lagging++;
2295 
2296 		/*
2297 		 * Determine absolute usage factoring in in-flight IOs to avoid
2298 		 * high-latency completions appearing as idle.
2299 		 */
2300 		usage_us = iocg->usage_delta_us;
2301 		usage_us_sum += usage_us;
2302 
2303 		/* see whether there's surplus vtime */
2304 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2305 		if (hw_inuse < hw_active ||
2306 		    (!waitqueue_active(&iocg->waitq) &&
2307 		     time_before64(vtime, now.vnow - ioc->margins.low))) {
2308 			u32 hwa, old_hwi, hwm, new_hwi, usage;
2309 			u64 usage_dur;
2310 
2311 			if (vdone != vtime) {
2312 				u64 inflight_us = DIV64_U64_ROUND_UP(
2313 					cost_to_abs_cost(vtime - vdone, hw_inuse),
2314 					ioc->vtime_base_rate);
2315 
2316 				usage_us = max(usage_us, inflight_us);
2317 			}
2318 
2319 			/* convert to hweight based usage ratio */
2320 			if (time_after64(iocg->activated_at, ioc->period_at))
2321 				usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2322 			else
2323 				usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2324 
2325 			usage = clamp_t(u32,
2326 				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2327 						   usage_dur),
2328 				1, WEIGHT_ONE);
2329 
2330 			/*
2331 			 * Already donating or accumulated enough to start.
2332 			 * Determine the donation amount.
2333 			 */
2334 			current_hweight(iocg, &hwa, &old_hwi);
2335 			hwm = current_hweight_max(iocg);
2336 			new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2337 							 usage, &now);
2338 			/*
2339 			 * Donation calculation assumes hweight_after_donation
2340 			 * to be positive, a condition that a donor w/ hwa < 2
2341 			 * can't meet. Don't bother with donation if hwa is
2342 			 * below 2. It's not gonna make a meaningful difference
2343 			 * anyway.
2344 			 */
2345 			if (new_hwi < hwm && hwa >= 2) {
2346 				iocg->hweight_donating = hwa;
2347 				iocg->hweight_after_donation = new_hwi;
2348 				list_add(&iocg->surplus_list, &surpluses);
2349 			} else if (!iocg->abs_vdebt) {
2350 				/*
2351 				 * @iocg doesn't have enough to donate. Reset
2352 				 * its inuse to active.
2353 				 *
2354 				 * Don't reset debtors as their inuse's are
2355 				 * owned by debt handling. This shouldn't affect
2356 				 * donation calculuation in any meaningful way
2357 				 * as @iocg doesn't have a meaningful amount of
2358 				 * share anyway.
2359 				 */
2360 				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2361 						iocg->inuse, iocg->active,
2362 						iocg->hweight_inuse, new_hwi);
2363 
2364 				__propagate_weights(iocg, iocg->active,
2365 						    iocg->active, true, &now);
2366 				nr_shortages++;
2367 			}
2368 		} else {
2369 			/* genuinely short on vtime */
2370 			nr_shortages++;
2371 		}
2372 	}
2373 
2374 	if (!list_empty(&surpluses) && nr_shortages)
2375 		transfer_surpluses(&surpluses, &now);
2376 
2377 	commit_weights(ioc);
2378 
2379 	/* surplus list should be dissolved after use */
2380 	list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2381 		list_del_init(&iocg->surplus_list);
2382 
2383 	/*
2384 	 * If q is getting clogged or we're missing too much, we're issuing
2385 	 * too much IO and should lower vtime rate.  If we're not missing
2386 	 * and experiencing shortages but not surpluses, we're too stingy
2387 	 * and should increase vtime rate.
2388 	 */
2389 	prev_busy_level = ioc->busy_level;
2390 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2391 	    missed_ppm[READ] > ppm_rthr ||
2392 	    missed_ppm[WRITE] > ppm_wthr) {
2393 		/* clearly missing QoS targets, slow down vrate */
2394 		ioc->busy_level = max(ioc->busy_level, 0);
2395 		ioc->busy_level++;
2396 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2397 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2398 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2399 		/* QoS targets are being met with >25% margin */
2400 		if (nr_shortages) {
2401 			/*
2402 			 * We're throttling while the device has spare
2403 			 * capacity.  If vrate was being slowed down, stop.
2404 			 */
2405 			ioc->busy_level = min(ioc->busy_level, 0);
2406 
2407 			/*
2408 			 * If there are IOs spanning multiple periods, wait
2409 			 * them out before pushing the device harder.
2410 			 */
2411 			if (!nr_lagging)
2412 				ioc->busy_level--;
2413 		} else {
2414 			/*
2415 			 * Nobody is being throttled and the users aren't
2416 			 * issuing enough IOs to saturate the device.  We
2417 			 * simply don't know how close the device is to
2418 			 * saturation.  Coast.
2419 			 */
2420 			ioc->busy_level = 0;
2421 		}
2422 	} else {
2423 		/* inside the hysterisis margin, we're good */
2424 		ioc->busy_level = 0;
2425 	}
2426 
2427 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2428 
2429 	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2430 			      prev_busy_level, missed_ppm);
2431 
2432 	ioc_refresh_params(ioc, false);
2433 
2434 	ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2435 
2436 	/*
2437 	 * This period is done.  Move onto the next one.  If nothing's
2438 	 * going on with the device, stop the timer.
2439 	 */
2440 	atomic64_inc(&ioc->cur_period);
2441 
2442 	if (ioc->running != IOC_STOP) {
2443 		if (!list_empty(&ioc->active_iocgs)) {
2444 			ioc_start_period(ioc, &now);
2445 		} else {
2446 			ioc->busy_level = 0;
2447 			ioc->vtime_err = 0;
2448 			ioc->running = IOC_IDLE;
2449 		}
2450 
2451 		ioc_refresh_vrate(ioc, &now);
2452 	}
2453 
2454 	spin_unlock_irq(&ioc->lock);
2455 }
2456 
2457 static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2458 				      u64 abs_cost, struct ioc_now *now)
2459 {
2460 	struct ioc *ioc = iocg->ioc;
2461 	struct ioc_margins *margins = &ioc->margins;
2462 	u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2463 	u32 hwi, adj_step;
2464 	s64 margin;
2465 	u64 cost, new_inuse;
2466 	unsigned long flags;
2467 
2468 	current_hweight(iocg, NULL, &hwi);
2469 	old_hwi = hwi;
2470 	cost = abs_cost_to_cost(abs_cost, hwi);
2471 	margin = now->vnow - vtime - cost;
2472 
2473 	/* debt handling owns inuse for debtors */
2474 	if (iocg->abs_vdebt)
2475 		return cost;
2476 
2477 	/*
2478 	 * We only increase inuse during period and do so if the margin has
2479 	 * deteriorated since the previous adjustment.
2480 	 */
2481 	if (margin >= iocg->saved_margin || margin >= margins->low ||
2482 	    iocg->inuse == iocg->active)
2483 		return cost;
2484 
2485 	spin_lock_irqsave(&ioc->lock, flags);
2486 
2487 	/* we own inuse only when @iocg is in the normal active state */
2488 	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2489 		spin_unlock_irqrestore(&ioc->lock, flags);
2490 		return cost;
2491 	}
2492 
2493 	/*
2494 	 * Bump up inuse till @abs_cost fits in the existing budget.
2495 	 * adj_step must be determined after acquiring ioc->lock - we might
2496 	 * have raced and lost to another thread for activation and could
2497 	 * be reading 0 iocg->active before ioc->lock which will lead to
2498 	 * infinite loop.
2499 	 */
2500 	new_inuse = iocg->inuse;
2501 	adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
2502 	do {
2503 		new_inuse = new_inuse + adj_step;
2504 		propagate_weights(iocg, iocg->active, new_inuse, true, now);
2505 		current_hweight(iocg, NULL, &hwi);
2506 		cost = abs_cost_to_cost(abs_cost, hwi);
2507 	} while (time_after64(vtime + cost, now->vnow) &&
2508 		 iocg->inuse != iocg->active);
2509 
2510 	spin_unlock_irqrestore(&ioc->lock, flags);
2511 
2512 	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2513 			old_inuse, iocg->inuse, old_hwi, hwi);
2514 
2515 	return cost;
2516 }
2517 
2518 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2519 				    bool is_merge, u64 *costp)
2520 {
2521 	struct ioc *ioc = iocg->ioc;
2522 	u64 coef_seqio, coef_randio, coef_page;
2523 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2524 	u64 seek_pages = 0;
2525 	u64 cost = 0;
2526 
2527 	/* Can't calculate cost for empty bio */
2528 	if (!bio->bi_iter.bi_size)
2529 		goto out;
2530 
2531 	switch (bio_op(bio)) {
2532 	case REQ_OP_READ:
2533 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
2534 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
2535 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
2536 		break;
2537 	case REQ_OP_WRITE:
2538 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
2539 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
2540 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
2541 		break;
2542 	default:
2543 		goto out;
2544 	}
2545 
2546 	if (iocg->cursor) {
2547 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2548 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2549 	}
2550 
2551 	if (!is_merge) {
2552 		if (seek_pages > LCOEF_RANDIO_PAGES) {
2553 			cost += coef_randio;
2554 		} else {
2555 			cost += coef_seqio;
2556 		}
2557 	}
2558 	cost += pages * coef_page;
2559 out:
2560 	*costp = cost;
2561 }
2562 
2563 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2564 {
2565 	u64 cost;
2566 
2567 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2568 	return cost;
2569 }
2570 
2571 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2572 					 u64 *costp)
2573 {
2574 	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2575 
2576 	switch (req_op(rq)) {
2577 	case REQ_OP_READ:
2578 		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2579 		break;
2580 	case REQ_OP_WRITE:
2581 		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2582 		break;
2583 	default:
2584 		*costp = 0;
2585 	}
2586 }
2587 
2588 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2589 {
2590 	u64 cost;
2591 
2592 	calc_size_vtime_cost_builtin(rq, ioc, &cost);
2593 	return cost;
2594 }
2595 
2596 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2597 {
2598 	struct blkcg_gq *blkg = bio->bi_blkg;
2599 	struct ioc *ioc = rqos_to_ioc(rqos);
2600 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
2601 	struct ioc_now now;
2602 	struct iocg_wait wait;
2603 	u64 abs_cost, cost, vtime;
2604 	bool use_debt, ioc_locked;
2605 	unsigned long flags;
2606 
2607 	/* bypass IOs if disabled, still initializing, or for root cgroup */
2608 	if (!ioc->enabled || !iocg || !iocg->level)
2609 		return;
2610 
2611 	/* calculate the absolute vtime cost */
2612 	abs_cost = calc_vtime_cost(bio, iocg, false);
2613 	if (!abs_cost)
2614 		return;
2615 
2616 	if (!iocg_activate(iocg, &now))
2617 		return;
2618 
2619 	iocg->cursor = bio_end_sector(bio);
2620 	vtime = atomic64_read(&iocg->vtime);
2621 	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2622 
2623 	/*
2624 	 * If no one's waiting and within budget, issue right away.  The
2625 	 * tests are racy but the races aren't systemic - we only miss once
2626 	 * in a while which is fine.
2627 	 */
2628 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2629 	    time_before_eq64(vtime + cost, now.vnow)) {
2630 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2631 		return;
2632 	}
2633 
2634 	/*
2635 	 * We're over budget. This can be handled in two ways. IOs which may
2636 	 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2637 	 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2638 	 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2639 	 * whether debt handling is needed and acquire locks accordingly.
2640 	 */
2641 	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2642 	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2643 retry_lock:
2644 	iocg_lock(iocg, ioc_locked, &flags);
2645 
2646 	/*
2647 	 * @iocg must stay activated for debt and waitq handling. Deactivation
2648 	 * is synchronized against both ioc->lock and waitq.lock and we won't
2649 	 * get deactivated as long as we're waiting or has debt, so we're good
2650 	 * if we're activated here. In the unlikely cases that we aren't, just
2651 	 * issue the IO.
2652 	 */
2653 	if (unlikely(list_empty(&iocg->active_list))) {
2654 		iocg_unlock(iocg, ioc_locked, &flags);
2655 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2656 		return;
2657 	}
2658 
2659 	/*
2660 	 * We're over budget. If @bio has to be issued regardless, remember
2661 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2662 	 * off the debt before waking more IOs.
2663 	 *
2664 	 * This way, the debt is continuously paid off each period with the
2665 	 * actual budget available to the cgroup. If we just wound vtime, we
2666 	 * would incorrectly use the current hw_inuse for the entire amount
2667 	 * which, for example, can lead to the cgroup staying blocked for a
2668 	 * long time even with substantially raised hw_inuse.
2669 	 *
2670 	 * An iocg with vdebt should stay online so that the timer can keep
2671 	 * deducting its vdebt and [de]activate use_delay mechanism
2672 	 * accordingly. We don't want to race against the timer trying to
2673 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2674 	 * penalizing the cgroup and its descendants.
2675 	 */
2676 	if (use_debt) {
2677 		iocg_incur_debt(iocg, abs_cost, &now);
2678 		if (iocg_kick_delay(iocg, &now))
2679 			blkcg_schedule_throttle(rqos->disk,
2680 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2681 		iocg_unlock(iocg, ioc_locked, &flags);
2682 		return;
2683 	}
2684 
2685 	/* guarantee that iocgs w/ waiters have maximum inuse */
2686 	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2687 		if (!ioc_locked) {
2688 			iocg_unlock(iocg, false, &flags);
2689 			ioc_locked = true;
2690 			goto retry_lock;
2691 		}
2692 		propagate_weights(iocg, iocg->active, iocg->active, true,
2693 				  &now);
2694 	}
2695 
2696 	/*
2697 	 * Append self to the waitq and schedule the wakeup timer if we're
2698 	 * the first waiter.  The timer duration is calculated based on the
2699 	 * current vrate.  vtime and hweight changes can make it too short
2700 	 * or too long.  Each wait entry records the absolute cost it's
2701 	 * waiting for to allow re-evaluation using a custom wait entry.
2702 	 *
2703 	 * If too short, the timer simply reschedules itself.  If too long,
2704 	 * the period timer will notice and trigger wakeups.
2705 	 *
2706 	 * All waiters are on iocg->waitq and the wait states are
2707 	 * synchronized using waitq.lock.
2708 	 */
2709 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2710 	wait.wait.private = current;
2711 	wait.bio = bio;
2712 	wait.abs_cost = abs_cost;
2713 	wait.committed = false;	/* will be set true by waker */
2714 
2715 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2716 	iocg_kick_waitq(iocg, ioc_locked, &now);
2717 
2718 	iocg_unlock(iocg, ioc_locked, &flags);
2719 
2720 	while (true) {
2721 		set_current_state(TASK_UNINTERRUPTIBLE);
2722 		if (wait.committed)
2723 			break;
2724 		io_schedule();
2725 	}
2726 
2727 	/* waker already committed us, proceed */
2728 	finish_wait(&iocg->waitq, &wait.wait);
2729 }
2730 
2731 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2732 			   struct bio *bio)
2733 {
2734 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2735 	struct ioc *ioc = rqos_to_ioc(rqos);
2736 	sector_t bio_end = bio_end_sector(bio);
2737 	struct ioc_now now;
2738 	u64 vtime, abs_cost, cost;
2739 	unsigned long flags;
2740 
2741 	/* bypass if disabled, still initializing, or for root cgroup */
2742 	if (!ioc->enabled || !iocg || !iocg->level)
2743 		return;
2744 
2745 	abs_cost = calc_vtime_cost(bio, iocg, true);
2746 	if (!abs_cost)
2747 		return;
2748 
2749 	ioc_now(ioc, &now);
2750 
2751 	vtime = atomic64_read(&iocg->vtime);
2752 	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2753 
2754 	/* update cursor if backmerging into the request at the cursor */
2755 	if (blk_rq_pos(rq) < bio_end &&
2756 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2757 		iocg->cursor = bio_end;
2758 
2759 	/*
2760 	 * Charge if there's enough vtime budget and the existing request has
2761 	 * cost assigned.
2762 	 */
2763 	if (rq->bio && rq->bio->bi_iocost_cost &&
2764 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2765 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2766 		return;
2767 	}
2768 
2769 	/*
2770 	 * Otherwise, account it as debt if @iocg is online, which it should
2771 	 * be for the vast majority of cases. See debt handling in
2772 	 * ioc_rqos_throttle() for details.
2773 	 */
2774 	spin_lock_irqsave(&ioc->lock, flags);
2775 	spin_lock(&iocg->waitq.lock);
2776 
2777 	if (likely(!list_empty(&iocg->active_list))) {
2778 		iocg_incur_debt(iocg, abs_cost, &now);
2779 		if (iocg_kick_delay(iocg, &now))
2780 			blkcg_schedule_throttle(rqos->disk,
2781 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2782 	} else {
2783 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2784 	}
2785 
2786 	spin_unlock(&iocg->waitq.lock);
2787 	spin_unlock_irqrestore(&ioc->lock, flags);
2788 }
2789 
2790 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2791 {
2792 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2793 
2794 	if (iocg && bio->bi_iocost_cost)
2795 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2796 }
2797 
2798 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2799 {
2800 	struct ioc *ioc = rqos_to_ioc(rqos);
2801 	struct ioc_pcpu_stat *ccs;
2802 	u64 on_q_ns, rq_wait_ns, size_nsec;
2803 	int pidx, rw;
2804 
2805 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2806 		return;
2807 
2808 	switch (req_op(rq)) {
2809 	case REQ_OP_READ:
2810 		pidx = QOS_RLAT;
2811 		rw = READ;
2812 		break;
2813 	case REQ_OP_WRITE:
2814 		pidx = QOS_WLAT;
2815 		rw = WRITE;
2816 		break;
2817 	default:
2818 		return;
2819 	}
2820 
2821 	on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
2822 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2823 	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2824 
2825 	ccs = get_cpu_ptr(ioc->pcpu_stat);
2826 
2827 	if (on_q_ns <= size_nsec ||
2828 	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2829 		local_inc(&ccs->missed[rw].nr_met);
2830 	else
2831 		local_inc(&ccs->missed[rw].nr_missed);
2832 
2833 	local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2834 
2835 	put_cpu_ptr(ccs);
2836 }
2837 
2838 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2839 {
2840 	struct ioc *ioc = rqos_to_ioc(rqos);
2841 
2842 	spin_lock_irq(&ioc->lock);
2843 	ioc_refresh_params(ioc, false);
2844 	spin_unlock_irq(&ioc->lock);
2845 }
2846 
2847 static void ioc_rqos_exit(struct rq_qos *rqos)
2848 {
2849 	struct ioc *ioc = rqos_to_ioc(rqos);
2850 
2851 	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
2852 
2853 	spin_lock_irq(&ioc->lock);
2854 	ioc->running = IOC_STOP;
2855 	spin_unlock_irq(&ioc->lock);
2856 
2857 	timer_shutdown_sync(&ioc->timer);
2858 	free_percpu(ioc->pcpu_stat);
2859 	kfree(ioc);
2860 }
2861 
2862 static const struct rq_qos_ops ioc_rqos_ops = {
2863 	.throttle = ioc_rqos_throttle,
2864 	.merge = ioc_rqos_merge,
2865 	.done_bio = ioc_rqos_done_bio,
2866 	.done = ioc_rqos_done,
2867 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
2868 	.exit = ioc_rqos_exit,
2869 };
2870 
2871 static int blk_iocost_init(struct gendisk *disk)
2872 {
2873 	struct ioc *ioc;
2874 	int i, cpu, ret;
2875 
2876 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2877 	if (!ioc)
2878 		return -ENOMEM;
2879 
2880 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2881 	if (!ioc->pcpu_stat) {
2882 		kfree(ioc);
2883 		return -ENOMEM;
2884 	}
2885 
2886 	for_each_possible_cpu(cpu) {
2887 		struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2888 
2889 		for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2890 			local_set(&ccs->missed[i].nr_met, 0);
2891 			local_set(&ccs->missed[i].nr_missed, 0);
2892 		}
2893 		local64_set(&ccs->rq_wait_ns, 0);
2894 	}
2895 
2896 	spin_lock_init(&ioc->lock);
2897 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
2898 	INIT_LIST_HEAD(&ioc->active_iocgs);
2899 
2900 	ioc->running = IOC_IDLE;
2901 	ioc->vtime_base_rate = VTIME_PER_USEC;
2902 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2903 	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2904 	ioc->period_at = ktime_to_us(blk_time_get());
2905 	atomic64_set(&ioc->cur_period, 0);
2906 	atomic_set(&ioc->hweight_gen, 0);
2907 
2908 	spin_lock_irq(&ioc->lock);
2909 	ioc->autop_idx = AUTOP_INVALID;
2910 	ioc_refresh_params_disk(ioc, true, disk);
2911 	spin_unlock_irq(&ioc->lock);
2912 
2913 	/*
2914 	 * rqos must be added before activation to allow ioc_pd_init() to
2915 	 * lookup the ioc from q. This means that the rqos methods may get
2916 	 * called before policy activation completion, can't assume that the
2917 	 * target bio has an iocg associated and need to test for NULL iocg.
2918 	 */
2919 	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
2920 	if (ret)
2921 		goto err_free_ioc;
2922 
2923 	ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
2924 	if (ret)
2925 		goto err_del_qos;
2926 	return 0;
2927 
2928 err_del_qos:
2929 	rq_qos_del(&ioc->rqos);
2930 err_free_ioc:
2931 	free_percpu(ioc->pcpu_stat);
2932 	kfree(ioc);
2933 	return ret;
2934 }
2935 
2936 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2937 {
2938 	struct ioc_cgrp *iocc;
2939 
2940 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2941 	if (!iocc)
2942 		return NULL;
2943 
2944 	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2945 	return &iocc->cpd;
2946 }
2947 
2948 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2949 {
2950 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2951 }
2952 
2953 static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
2954 		struct blkcg *blkcg, gfp_t gfp)
2955 {
2956 	int levels = blkcg->css.cgroup->level + 1;
2957 	struct ioc_gq *iocg;
2958 
2959 	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
2960 			    disk->node_id);
2961 	if (!iocg)
2962 		return NULL;
2963 
2964 	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2965 	if (!iocg->pcpu_stat) {
2966 		kfree(iocg);
2967 		return NULL;
2968 	}
2969 
2970 	return &iocg->pd;
2971 }
2972 
2973 static void ioc_pd_init(struct blkg_policy_data *pd)
2974 {
2975 	struct ioc_gq *iocg = pd_to_iocg(pd);
2976 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2977 	struct ioc *ioc = q_to_ioc(blkg->q);
2978 	struct ioc_now now;
2979 	struct blkcg_gq *tblkg;
2980 	unsigned long flags;
2981 
2982 	ioc_now(ioc, &now);
2983 
2984 	iocg->ioc = ioc;
2985 	atomic64_set(&iocg->vtime, now.vnow);
2986 	atomic64_set(&iocg->done_vtime, now.vnow);
2987 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2988 	INIT_LIST_HEAD(&iocg->active_list);
2989 	INIT_LIST_HEAD(&iocg->walk_list);
2990 	INIT_LIST_HEAD(&iocg->surplus_list);
2991 	iocg->hweight_active = WEIGHT_ONE;
2992 	iocg->hweight_inuse = WEIGHT_ONE;
2993 
2994 	init_waitqueue_head(&iocg->waitq);
2995 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2996 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2997 
2998 	iocg->level = blkg->blkcg->css.cgroup->level;
2999 
3000 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
3001 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
3002 		iocg->ancestors[tiocg->level] = tiocg;
3003 	}
3004 
3005 	spin_lock_irqsave(&ioc->lock, flags);
3006 	weight_updated(iocg, &now);
3007 	spin_unlock_irqrestore(&ioc->lock, flags);
3008 }
3009 
3010 static void ioc_pd_free(struct blkg_policy_data *pd)
3011 {
3012 	struct ioc_gq *iocg = pd_to_iocg(pd);
3013 	struct ioc *ioc = iocg->ioc;
3014 	unsigned long flags;
3015 
3016 	if (ioc) {
3017 		spin_lock_irqsave(&ioc->lock, flags);
3018 
3019 		if (!list_empty(&iocg->active_list)) {
3020 			struct ioc_now now;
3021 
3022 			ioc_now(ioc, &now);
3023 			propagate_weights(iocg, 0, 0, false, &now);
3024 			list_del_init(&iocg->active_list);
3025 		}
3026 
3027 		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
3028 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
3029 
3030 		spin_unlock_irqrestore(&ioc->lock, flags);
3031 
3032 		hrtimer_cancel(&iocg->waitq_timer);
3033 	}
3034 	free_percpu(iocg->pcpu_stat);
3035 	kfree(iocg);
3036 }
3037 
3038 static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
3039 {
3040 	struct ioc_gq *iocg = pd_to_iocg(pd);
3041 	struct ioc *ioc = iocg->ioc;
3042 
3043 	if (!ioc->enabled)
3044 		return;
3045 
3046 	if (iocg->level == 0) {
3047 		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3048 			ioc->vtime_base_rate * 10000,
3049 			VTIME_PER_USEC);
3050 		seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
3051 	}
3052 
3053 	seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
3054 
3055 	if (blkcg_debug_stats)
3056 		seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3057 			iocg->last_stat.wait_us,
3058 			iocg->last_stat.indebt_us,
3059 			iocg->last_stat.indelay_us);
3060 }
3061 
3062 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3063 			     int off)
3064 {
3065 	const char *dname = blkg_dev_name(pd->blkg);
3066 	struct ioc_gq *iocg = pd_to_iocg(pd);
3067 
3068 	if (dname && iocg->cfg_weight)
3069 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3070 	return 0;
3071 }
3072 
3073 
3074 static int ioc_weight_show(struct seq_file *sf, void *v)
3075 {
3076 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3077 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3078 
3079 	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3080 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3081 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3082 	return 0;
3083 }
3084 
3085 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3086 				size_t nbytes, loff_t off)
3087 {
3088 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
3089 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3090 	struct blkg_conf_ctx ctx;
3091 	struct ioc_now now;
3092 	struct ioc_gq *iocg;
3093 	u32 v;
3094 	int ret;
3095 
3096 	if (!strchr(buf, ':')) {
3097 		struct blkcg_gq *blkg;
3098 
3099 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3100 			return -EINVAL;
3101 
3102 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3103 			return -EINVAL;
3104 
3105 		spin_lock_irq(&blkcg->lock);
3106 		iocc->dfl_weight = v * WEIGHT_ONE;
3107 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3108 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
3109 
3110 			if (iocg) {
3111 				spin_lock(&iocg->ioc->lock);
3112 				ioc_now(iocg->ioc, &now);
3113 				weight_updated(iocg, &now);
3114 				spin_unlock(&iocg->ioc->lock);
3115 			}
3116 		}
3117 		spin_unlock_irq(&blkcg->lock);
3118 
3119 		return nbytes;
3120 	}
3121 
3122 	blkg_conf_init(&ctx, buf);
3123 
3124 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
3125 	if (ret)
3126 		goto err;
3127 
3128 	iocg = blkg_to_iocg(ctx.blkg);
3129 
3130 	if (!strncmp(ctx.body, "default", 7)) {
3131 		v = 0;
3132 	} else {
3133 		if (!sscanf(ctx.body, "%u", &v))
3134 			goto einval;
3135 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3136 			goto einval;
3137 	}
3138 
3139 	spin_lock(&iocg->ioc->lock);
3140 	iocg->cfg_weight = v * WEIGHT_ONE;
3141 	ioc_now(iocg->ioc, &now);
3142 	weight_updated(iocg, &now);
3143 	spin_unlock(&iocg->ioc->lock);
3144 
3145 	blkg_conf_exit(&ctx);
3146 	return nbytes;
3147 
3148 einval:
3149 	ret = -EINVAL;
3150 err:
3151 	blkg_conf_exit(&ctx);
3152 	return ret;
3153 }
3154 
3155 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3156 			  int off)
3157 {
3158 	const char *dname = blkg_dev_name(pd->blkg);
3159 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3160 
3161 	if (!dname)
3162 		return 0;
3163 
3164 	spin_lock_irq(&ioc->lock);
3165 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3166 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3167 		   ioc->params.qos[QOS_RPPM] / 10000,
3168 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
3169 		   ioc->params.qos[QOS_RLAT],
3170 		   ioc->params.qos[QOS_WPPM] / 10000,
3171 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
3172 		   ioc->params.qos[QOS_WLAT],
3173 		   ioc->params.qos[QOS_MIN] / 10000,
3174 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
3175 		   ioc->params.qos[QOS_MAX] / 10000,
3176 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
3177 	spin_unlock_irq(&ioc->lock);
3178 	return 0;
3179 }
3180 
3181 static int ioc_qos_show(struct seq_file *sf, void *v)
3182 {
3183 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3184 
3185 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3186 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3187 	return 0;
3188 }
3189 
3190 static const match_table_t qos_ctrl_tokens = {
3191 	{ QOS_ENABLE,		"enable=%u"	},
3192 	{ QOS_CTRL,		"ctrl=%s"	},
3193 	{ NR_QOS_CTRL_PARAMS,	NULL		},
3194 };
3195 
3196 static const match_table_t qos_tokens = {
3197 	{ QOS_RPPM,		"rpct=%s"	},
3198 	{ QOS_RLAT,		"rlat=%u"	},
3199 	{ QOS_WPPM,		"wpct=%s"	},
3200 	{ QOS_WLAT,		"wlat=%u"	},
3201 	{ QOS_MIN,		"min=%s"	},
3202 	{ QOS_MAX,		"max=%s"	},
3203 	{ NR_QOS_PARAMS,	NULL		},
3204 };
3205 
3206 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3207 			     size_t nbytes, loff_t off)
3208 {
3209 	struct blkg_conf_ctx ctx;
3210 	struct gendisk *disk;
3211 	struct ioc *ioc;
3212 	u32 qos[NR_QOS_PARAMS];
3213 	bool enable, user;
3214 	char *body, *p;
3215 	int ret;
3216 
3217 	blkg_conf_init(&ctx, input);
3218 
3219 	ret = blkg_conf_open_bdev(&ctx);
3220 	if (ret)
3221 		goto err;
3222 
3223 	body = ctx.body;
3224 	disk = ctx.bdev->bd_disk;
3225 	if (!queue_is_mq(disk->queue)) {
3226 		ret = -EOPNOTSUPP;
3227 		goto err;
3228 	}
3229 
3230 	ioc = q_to_ioc(disk->queue);
3231 	if (!ioc) {
3232 		ret = blk_iocost_init(disk);
3233 		if (ret)
3234 			goto err;
3235 		ioc = q_to_ioc(disk->queue);
3236 	}
3237 
3238 	blk_mq_freeze_queue(disk->queue);
3239 	blk_mq_quiesce_queue(disk->queue);
3240 
3241 	spin_lock_irq(&ioc->lock);
3242 	memcpy(qos, ioc->params.qos, sizeof(qos));
3243 	enable = ioc->enabled;
3244 	user = ioc->user_qos_params;
3245 
3246 	while ((p = strsep(&body, " \t\n"))) {
3247 		substring_t args[MAX_OPT_ARGS];
3248 		char buf[32];
3249 		int tok;
3250 		s64 v;
3251 
3252 		if (!*p)
3253 			continue;
3254 
3255 		switch (match_token(p, qos_ctrl_tokens, args)) {
3256 		case QOS_ENABLE:
3257 			if (match_u64(&args[0], &v))
3258 				goto einval;
3259 			enable = v;
3260 			continue;
3261 		case QOS_CTRL:
3262 			match_strlcpy(buf, &args[0], sizeof(buf));
3263 			if (!strcmp(buf, "auto"))
3264 				user = false;
3265 			else if (!strcmp(buf, "user"))
3266 				user = true;
3267 			else
3268 				goto einval;
3269 			continue;
3270 		}
3271 
3272 		tok = match_token(p, qos_tokens, args);
3273 		switch (tok) {
3274 		case QOS_RPPM:
3275 		case QOS_WPPM:
3276 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3277 			    sizeof(buf))
3278 				goto einval;
3279 			if (cgroup_parse_float(buf, 2, &v))
3280 				goto einval;
3281 			if (v < 0 || v > 10000)
3282 				goto einval;
3283 			qos[tok] = v * 100;
3284 			break;
3285 		case QOS_RLAT:
3286 		case QOS_WLAT:
3287 			if (match_u64(&args[0], &v))
3288 				goto einval;
3289 			qos[tok] = v;
3290 			break;
3291 		case QOS_MIN:
3292 		case QOS_MAX:
3293 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3294 			    sizeof(buf))
3295 				goto einval;
3296 			if (cgroup_parse_float(buf, 2, &v))
3297 				goto einval;
3298 			if (v < 0)
3299 				goto einval;
3300 			qos[tok] = clamp_t(s64, v * 100,
3301 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
3302 			break;
3303 		default:
3304 			goto einval;
3305 		}
3306 		user = true;
3307 	}
3308 
3309 	if (qos[QOS_MIN] > qos[QOS_MAX])
3310 		goto einval;
3311 
3312 	if (enable && !ioc->enabled) {
3313 		blk_stat_enable_accounting(disk->queue);
3314 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
3315 		ioc->enabled = true;
3316 	} else if (!enable && ioc->enabled) {
3317 		blk_stat_disable_accounting(disk->queue);
3318 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
3319 		ioc->enabled = false;
3320 	}
3321 
3322 	if (user) {
3323 		memcpy(ioc->params.qos, qos, sizeof(qos));
3324 		ioc->user_qos_params = true;
3325 	} else {
3326 		ioc->user_qos_params = false;
3327 	}
3328 
3329 	ioc_refresh_params(ioc, true);
3330 	spin_unlock_irq(&ioc->lock);
3331 
3332 	if (enable)
3333 		wbt_disable_default(disk);
3334 	else
3335 		wbt_enable_default(disk);
3336 
3337 	blk_mq_unquiesce_queue(disk->queue);
3338 	blk_mq_unfreeze_queue(disk->queue);
3339 
3340 	blkg_conf_exit(&ctx);
3341 	return nbytes;
3342 einval:
3343 	spin_unlock_irq(&ioc->lock);
3344 
3345 	blk_mq_unquiesce_queue(disk->queue);
3346 	blk_mq_unfreeze_queue(disk->queue);
3347 
3348 	ret = -EINVAL;
3349 err:
3350 	blkg_conf_exit(&ctx);
3351 	return ret;
3352 }
3353 
3354 static u64 ioc_cost_model_prfill(struct seq_file *sf,
3355 				 struct blkg_policy_data *pd, int off)
3356 {
3357 	const char *dname = blkg_dev_name(pd->blkg);
3358 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3359 	u64 *u = ioc->params.i_lcoefs;
3360 
3361 	if (!dname)
3362 		return 0;
3363 
3364 	spin_lock_irq(&ioc->lock);
3365 	seq_printf(sf, "%s ctrl=%s model=linear "
3366 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
3367 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3368 		   dname, ioc->user_cost_model ? "user" : "auto",
3369 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3370 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3371 	spin_unlock_irq(&ioc->lock);
3372 	return 0;
3373 }
3374 
3375 static int ioc_cost_model_show(struct seq_file *sf, void *v)
3376 {
3377 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3378 
3379 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3380 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3381 	return 0;
3382 }
3383 
3384 static const match_table_t cost_ctrl_tokens = {
3385 	{ COST_CTRL,		"ctrl=%s"	},
3386 	{ COST_MODEL,		"model=%s"	},
3387 	{ NR_COST_CTRL_PARAMS,	NULL		},
3388 };
3389 
3390 static const match_table_t i_lcoef_tokens = {
3391 	{ I_LCOEF_RBPS,		"rbps=%u"	},
3392 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
3393 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
3394 	{ I_LCOEF_WBPS,		"wbps=%u"	},
3395 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
3396 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
3397 	{ NR_I_LCOEFS,		NULL		},
3398 };
3399 
3400 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3401 				    size_t nbytes, loff_t off)
3402 {
3403 	struct blkg_conf_ctx ctx;
3404 	struct request_queue *q;
3405 	struct ioc *ioc;
3406 	u64 u[NR_I_LCOEFS];
3407 	bool user;
3408 	char *body, *p;
3409 	int ret;
3410 
3411 	blkg_conf_init(&ctx, input);
3412 
3413 	ret = blkg_conf_open_bdev(&ctx);
3414 	if (ret)
3415 		goto err;
3416 
3417 	body = ctx.body;
3418 	q = bdev_get_queue(ctx.bdev);
3419 	if (!queue_is_mq(q)) {
3420 		ret = -EOPNOTSUPP;
3421 		goto err;
3422 	}
3423 
3424 	ioc = q_to_ioc(q);
3425 	if (!ioc) {
3426 		ret = blk_iocost_init(ctx.bdev->bd_disk);
3427 		if (ret)
3428 			goto err;
3429 		ioc = q_to_ioc(q);
3430 	}
3431 
3432 	blk_mq_freeze_queue(q);
3433 	blk_mq_quiesce_queue(q);
3434 
3435 	spin_lock_irq(&ioc->lock);
3436 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3437 	user = ioc->user_cost_model;
3438 
3439 	while ((p = strsep(&body, " \t\n"))) {
3440 		substring_t args[MAX_OPT_ARGS];
3441 		char buf[32];
3442 		int tok;
3443 		u64 v;
3444 
3445 		if (!*p)
3446 			continue;
3447 
3448 		switch (match_token(p, cost_ctrl_tokens, args)) {
3449 		case COST_CTRL:
3450 			match_strlcpy(buf, &args[0], sizeof(buf));
3451 			if (!strcmp(buf, "auto"))
3452 				user = false;
3453 			else if (!strcmp(buf, "user"))
3454 				user = true;
3455 			else
3456 				goto einval;
3457 			continue;
3458 		case COST_MODEL:
3459 			match_strlcpy(buf, &args[0], sizeof(buf));
3460 			if (strcmp(buf, "linear"))
3461 				goto einval;
3462 			continue;
3463 		}
3464 
3465 		tok = match_token(p, i_lcoef_tokens, args);
3466 		if (tok == NR_I_LCOEFS)
3467 			goto einval;
3468 		if (match_u64(&args[0], &v))
3469 			goto einval;
3470 		u[tok] = v;
3471 		user = true;
3472 	}
3473 
3474 	if (user) {
3475 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3476 		ioc->user_cost_model = true;
3477 	} else {
3478 		ioc->user_cost_model = false;
3479 	}
3480 	ioc_refresh_params(ioc, true);
3481 	spin_unlock_irq(&ioc->lock);
3482 
3483 	blk_mq_unquiesce_queue(q);
3484 	blk_mq_unfreeze_queue(q);
3485 
3486 	blkg_conf_exit(&ctx);
3487 	return nbytes;
3488 
3489 einval:
3490 	spin_unlock_irq(&ioc->lock);
3491 
3492 	blk_mq_unquiesce_queue(q);
3493 	blk_mq_unfreeze_queue(q);
3494 
3495 	ret = -EINVAL;
3496 err:
3497 	blkg_conf_exit(&ctx);
3498 	return ret;
3499 }
3500 
3501 static struct cftype ioc_files[] = {
3502 	{
3503 		.name = "weight",
3504 		.flags = CFTYPE_NOT_ON_ROOT,
3505 		.seq_show = ioc_weight_show,
3506 		.write = ioc_weight_write,
3507 	},
3508 	{
3509 		.name = "cost.qos",
3510 		.flags = CFTYPE_ONLY_ON_ROOT,
3511 		.seq_show = ioc_qos_show,
3512 		.write = ioc_qos_write,
3513 	},
3514 	{
3515 		.name = "cost.model",
3516 		.flags = CFTYPE_ONLY_ON_ROOT,
3517 		.seq_show = ioc_cost_model_show,
3518 		.write = ioc_cost_model_write,
3519 	},
3520 	{}
3521 };
3522 
3523 static struct blkcg_policy blkcg_policy_iocost = {
3524 	.dfl_cftypes	= ioc_files,
3525 	.cpd_alloc_fn	= ioc_cpd_alloc,
3526 	.cpd_free_fn	= ioc_cpd_free,
3527 	.pd_alloc_fn	= ioc_pd_alloc,
3528 	.pd_init_fn	= ioc_pd_init,
3529 	.pd_free_fn	= ioc_pd_free,
3530 	.pd_stat_fn	= ioc_pd_stat,
3531 };
3532 
3533 static int __init ioc_init(void)
3534 {
3535 	return blkcg_policy_register(&blkcg_policy_iocost);
3536 }
3537 
3538 static void __exit ioc_exit(void)
3539 {
3540 	blkcg_policy_unregister(&blkcg_policy_iocost);
3541 }
3542 
3543 module_init(ioc_init);
3544 module_exit(ioc_exit);
3545