xref: /linux/block/blk-iocost.c (revision cbafa54aa2ae23939846e150ad4ba98c784f6395)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
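 *
 * For example (hypothetical coefficients, not the built-in defaults): on a
 * device modeled at 400MB/s sequential throughput and 10k random 4k IOPS,
 * the per-page cost is ~1s / 100k = ~10us of device time and the random
 * base cost is ~1s / 10k - 10us = ~90us, so a 64k random read is estimated
 * at roughly 90us + 16 * 10us = ~250us.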
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (WEIGHT_ONE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO if doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
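 *
 * A simplified sketch of the admission check (the real code also deals
 * with debt, delay and waiters):
 *
 *	vbudget = vnow - iocg_vtime;
 *	if (cost <= vbudget)
 *		charge cost to iocg_vtime and issue the bio;
 *	else
 *		wait until vnow has advanced by (cost - vbudget);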
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
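 *
 * For example, at a 75% vrate a cgroup with a 50% hweight can issue at
 * most ~375ms worth of model-estimated device time per wall clock second.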
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
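 *
 * For example, a line like the following written to io.cost.qos (see
 * Documentation/admin-guide/cgroup-v2.rst for the authoritative syntax)
 * enables the controller on device 8:16 and treats it as saturated when
 * the read p95 exceeds 10ms or the write p95 exceeds 20ms:
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 wlat=20000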
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
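 * In the example above, A's inuse would be lowered until its hweight_inuse
 * hovers just above the 10% it actually consumes, letting B's
 * hweight_inuse grow toward ~90% and bringing device utilization back
 * close to 100%.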
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, working out who
149  * can donate and who should take back how much requires hweight
150  * propagation anyway, making it easier to implement and understand as a
151  * separate mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - delay	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include <asm/local.h>
183 #include <asm/local64.h>
184 #include "blk-rq-qos.h"
185 #include "blk-stat.h"
186 #include "blk-wbt.h"
187 
188 #ifdef CONFIG_TRACEPOINTS
189 
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191 #define TRACE_IOCG_PATH_LEN 1024
192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194 
195 #define TRACE_IOCG_PATH(type, iocg, ...)					\
196 	do {									\
197 		unsigned long flags;						\
198 		if (trace_iocost_##type##_enabled()) {				\
199 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
200 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
201 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
202 			trace_iocost_##type(iocg, trace_iocg_path,		\
203 					      ##__VA_ARGS__);			\
204 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
205 		}								\
206 	} while (0)
207 
208 #else	/* CONFIG_TRACEPOINTS */
209 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
210 #endif	/* CONFIG_TRACEPOINTS */
211 
212 enum {
213 	MILLION			= 1000000,
214 
215 	/* timer period is calculated from latency requirements, bound it */
216 	MIN_PERIOD		= USEC_PER_MSEC,
217 	MAX_PERIOD		= USEC_PER_SEC,
218 
219 	/*
220 	 * iocg->vtime is targeted at 50% behind the device vtime, which
221 	 * serves as its IO credit buffer.  Surplus weight adjustment is
222 	 * immediately canceled if the vtime margin runs below 10%.
223 	 */
224 	MARGIN_MIN_PCT		= 10,
225 	MARGIN_LOW_PCT		= 20,
226 	MARGIN_TARGET_PCT	= 50,
227 
228 	INUSE_ADJ_STEP_PCT	= 25,
229 
230 	/* Have some play in timer operations */
231 	TIMER_SLACK_PCT		= 1,
232 
233 	/* 1/64k is granular enough and can easily be handled w/ u32 */
234 	WEIGHT_ONE		= 1 << 16,
235 
236 	/*
237 	 * As vtime is used to calculate the cost of each IO, it needs to
238 	 * be fairly high precision.  For example, it should be able to
239 	 * represent the cost of a single page worth of discard with
240 	 * sufficient accuracy.  At the same time, it should be able to
241 	 * represent reasonably long enough durations to be useful and
242 	 * convenient during operation.
243 	 *
244 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
245 	 * granularity and days of wrap-around time even at extreme vrates.
246 	 */
247 	VTIME_PER_SEC_SHIFT	= 37,
248 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
249 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
250 	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
251 
252 	/* bound vrate adjustments within two orders of magnitude */
253 	VRATE_MIN_PPM		= 10000,	/* 1% */
254 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
255 
256 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
257 	VRATE_CLAMP_ADJ_PCT	= 4,
258 
259 	/* if IOs end up waiting for requests, issue less */
260 	RQ_WAIT_BUSY_PCT	= 5,
261 
262 	/* unbusy hysteresis */
263 	UNBUSY_THR_PCT		= 75,
264 
265 	/*
266 	 * The effect of delay is indirect and non-linear and a huge amount of
267 	 * future debt can accumulate abruptly while unthrottled. Linearly scale
268 	 * up delay as debt is going up and then let it decay exponentially.
269 	 * This gives us quick ramp ups while delay is accumulating and long
270 	 * tails which can help reducing the frequency of debt explosions on
271 	 * unthrottle. The parameters are experimentally determined.
272 	 *
273 	 * The delay mechanism provides adequate protection and behavior in many
274 	 * cases. However, this is far from ideal and falls short on both
275 	 * fronts. The debtors are often throttled too harshly costing a
276 	 * significant level of fairness and possibly total work while the
277 	 * protection against their impacts on the system can be choppy and
278 	 * unreliable.
279 	 *
280 	 * The shortcoming primarily stems from the fact that, unlike for page
281 	 * cache, the kernel doesn't have a well-defined back-pressure propagation
282 	 * mechanism and policies for anonymous memory. Fully addressing this
283 	 * issue will likely require substantial improvements in the area.
284 	 */
285 	MIN_DELAY_THR_PCT	= 500,
286 	MAX_DELAY_THR_PCT	= 25000,
287 	MIN_DELAY		= 250,
288 	MAX_DELAY		= 250 * USEC_PER_MSEC,
289 
290 	/* halve debts if avg usage over 100ms is under 50% */
291 	DFGV_USAGE_PCT		= 50,
292 	DFGV_PERIOD		= 100 * USEC_PER_MSEC,
293 
294 	/* don't let cmds which take a very long time pin lagging for too long */
295 	MAX_LAGGING_PERIODS	= 10,
296 
297 	/* switch iff the conditions are met for longer than this */
298 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
299 
300 	/*
301 	 * Count IO size in 4k pages.  The 12bit shift helps keeping
302 	 * size-proportional components of cost calculation in closer
303 	 * numbers of digits to per-IO cost components.
304 	 */
305 	IOC_PAGE_SHIFT		= 12,
306 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
307 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
308 
309 	/* if apart further than 16M, consider randio for linear model */
310 	LCOEF_RANDIO_PAGES	= 4096,
311 };
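
/*
 * A rough sense of scale, derived from the constants above: with
 * VTIME_PER_SEC_SHIFT of 37, VTIME_PER_USEC is 2^37 / 10^6, roughly 137k,
 * so an IO modeled at 100us of device time costs about 13.7M vtime units
 * while u64 still leaves days of headroom before wrap-around even at the
 * maximum vrate.
 */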
312 
313 enum ioc_running {
314 	IOC_IDLE,
315 	IOC_RUNNING,
316 	IOC_STOP,
317 };
318 
319 /* io.cost.qos controls including per-dev enable of the whole controller */
320 enum {
321 	QOS_ENABLE,
322 	QOS_CTRL,
323 	NR_QOS_CTRL_PARAMS,
324 };
325 
326 /* io.cost.qos params */
327 enum {
328 	QOS_RPPM,
329 	QOS_RLAT,
330 	QOS_WPPM,
331 	QOS_WLAT,
332 	QOS_MIN,
333 	QOS_MAX,
334 	NR_QOS_PARAMS,
335 };
336 
337 /* io.cost.model controls */
338 enum {
339 	COST_CTRL,
340 	COST_MODEL,
341 	NR_COST_CTRL_PARAMS,
342 };
343 
344 /* builtin linear cost model coefficients */
345 enum {
346 	I_LCOEF_RBPS,
347 	I_LCOEF_RSEQIOPS,
348 	I_LCOEF_RRANDIOPS,
349 	I_LCOEF_WBPS,
350 	I_LCOEF_WSEQIOPS,
351 	I_LCOEF_WRANDIOPS,
352 	NR_I_LCOEFS,
353 };
354 
355 enum {
356 	LCOEF_RPAGE,
357 	LCOEF_RSEQIO,
358 	LCOEF_RRANDIO,
359 	LCOEF_WPAGE,
360 	LCOEF_WSEQIO,
361 	LCOEF_WRANDIO,
362 	NR_LCOEFS,
363 };
364 
365 enum {
366 	AUTOP_INVALID,
367 	AUTOP_HDD,
368 	AUTOP_SSD_QD1,
369 	AUTOP_SSD_DFL,
370 	AUTOP_SSD_FAST,
371 };
372 
373 struct ioc_params {
374 	u32				qos[NR_QOS_PARAMS];
375 	u64				i_lcoefs[NR_I_LCOEFS];
376 	u64				lcoefs[NR_LCOEFS];
377 	u32				too_fast_vrate_pct;
378 	u32				too_slow_vrate_pct;
379 };
380 
381 struct ioc_margins {
382 	s64				min;
383 	s64				low;
384 	s64				target;
385 };
386 
387 struct ioc_missed {
388 	local_t				nr_met;
389 	local_t				nr_missed;
390 	u32				last_met;
391 	u32				last_missed;
392 };
393 
394 struct ioc_pcpu_stat {
395 	struct ioc_missed		missed[2];
396 
397 	local64_t			rq_wait_ns;
398 	u64				last_rq_wait_ns;
399 };
400 
401 /* per device */
402 struct ioc {
403 	struct rq_qos			rqos;
404 
405 	bool				enabled;
406 
407 	struct ioc_params		params;
408 	struct ioc_margins		margins;
409 	u32				period_us;
410 	u32				timer_slack_ns;
411 	u64				vrate_min;
412 	u64				vrate_max;
413 
414 	spinlock_t			lock;
415 	struct timer_list		timer;
416 	struct list_head		active_iocgs;	/* active cgroups */
417 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
418 
419 	enum ioc_running		running;
420 	atomic64_t			vtime_rate;
421 	u64				vtime_base_rate;
422 	s64				vtime_err;
423 
424 	seqcount_spinlock_t		period_seqcount;
425 	u64				period_at;	/* wallclock starttime */
426 	u64				period_at_vtime; /* vtime starttime */
427 
428 	atomic64_t			cur_period;	/* inc'd each period */
429 	int				busy_level;	/* saturation history */
430 
431 	bool				weights_updated;
432 	atomic_t			hweight_gen;	/* for lazy hweights */
433 
434 	/* debt forgiveness */
435 	u64				dfgv_period_at;
436 	u64				dfgv_period_rem;
437 	u64				dfgv_usage_us_sum;
438 
439 	u64				autop_too_fast_at;
440 	u64				autop_too_slow_at;
441 	int				autop_idx;
442 	bool				user_qos_params:1;
443 	bool				user_cost_model:1;
444 };
445 
446 struct iocg_pcpu_stat {
447 	local64_t			abs_vusage;
448 };
449 
450 struct iocg_stat {
451 	u64				usage_us;
452 	u64				wait_us;
453 	u64				indebt_us;
454 	u64				indelay_us;
455 };
456 
457 /* per device-cgroup pair */
458 struct ioc_gq {
459 	struct blkg_policy_data		pd;
460 	struct ioc			*ioc;
461 
462 	/*
463 	 * An iocg can get its weight from two sources - an explicit
464 	 * per-device-cgroup configuration or the default weight of the
465 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
466 	 * configuration.  `weight` is the effective weight considering both
467 	 * sources.
468 	 *
469 	 * When an idle cgroup becomes active its `active` goes from 0 to
470 	 * `weight`.  `inuse` is the surplus adjusted active weight.
471 	 * `active` and `inuse` are used to calculate `hweight_active` and
472 	 * `hweight_inuse`.
473 	 *
474 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
475 	 * surplus adjustments.
476 	 *
477 	 * `inuse` may be adjusted dynamically during a period. `saved_*` are used
478 	 * to determine and track adjustments.
479 	 */
480 	u32				cfg_weight;
481 	u32				weight;
482 	u32				active;
483 	u32				inuse;
484 
485 	u32				last_inuse;
486 	s64				saved_margin;
487 
488 	sector_t			cursor;		/* to detect randio */
489 
490 	/*
491 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
492 	 * issued.  If lagging behind device vtime, the delta represents
493 	 * the currently available IO budget.  If running ahead, the
494 	 * overage.
495 	 *
496 	 * `done_vtime` is the same but progressed on completion rather
497 	 * than issue.  The delta behind `vtime` represents the cost of
498 	 * currently in-flight IOs.
499 	 */
500 	atomic64_t			vtime;
501 	atomic64_t			done_vtime;
502 	u64				abs_vdebt;
503 
504 	/* current delay in effect and when it started */
505 	u64				delay;
506 	u64				delay_at;
507 
508 	/*
509 	 * The period this iocg was last active in.  Used for deactivation
510 	 * and invalidating `vtime`.
511 	 */
512 	atomic64_t			active_period;
513 	struct list_head		active_list;
514 
515 	/* see __propagate_weights() and current_hweight() for details */
516 	u64				child_active_sum;
517 	u64				child_inuse_sum;
518 	u64				child_adjusted_sum;
519 	int				hweight_gen;
520 	u32				hweight_active;
521 	u32				hweight_inuse;
522 	u32				hweight_donating;
523 	u32				hweight_after_donation;
524 
525 	struct list_head		walk_list;
526 	struct list_head		surplus_list;
527 
528 	struct wait_queue_head		waitq;
529 	struct hrtimer			waitq_timer;
530 
531 	/* timestamp at the latest activation */
532 	u64				activated_at;
533 
534 	/* statistics */
535 	struct iocg_pcpu_stat __percpu	*pcpu_stat;
536 	struct iocg_stat		local_stat;
537 	struct iocg_stat		desc_stat;
538 	struct iocg_stat		last_stat;
539 	u64				last_stat_abs_vusage;
540 	u64				usage_delta_us;
541 	u64				wait_since;
542 	u64				indebt_since;
543 	u64				indelay_since;
544 
545 	/* this iocg's depth in the hierarchy and ancestors including self */
546 	int				level;
547 	struct ioc_gq			*ancestors[];
548 };
549 
550 /* per cgroup */
551 struct ioc_cgrp {
552 	struct blkcg_policy_data	cpd;
553 	unsigned int			dfl_weight;
554 };
555 
556 struct ioc_now {
557 	u64				now_ns;
558 	u64				now;
559 	u64				vnow;
560 	u64				vrate;
561 };
562 
563 struct iocg_wait {
564 	struct wait_queue_entry		wait;
565 	struct bio			*bio;
566 	u64				abs_cost;
567 	bool				committed;
568 };
569 
570 struct iocg_wake_ctx {
571 	struct ioc_gq			*iocg;
572 	u32				hw_inuse;
573 	s64				vbudget;
574 };
575 
576 static const struct ioc_params autop[] = {
577 	[AUTOP_HDD] = {
578 		.qos				= {
579 			[QOS_RLAT]		=        250000, /* 250ms */
580 			[QOS_WLAT]		=        250000,
581 			[QOS_MIN]		= VRATE_MIN_PPM,
582 			[QOS_MAX]		= VRATE_MAX_PPM,
583 		},
584 		.i_lcoefs			= {
585 			[I_LCOEF_RBPS]		=     174019176,
586 			[I_LCOEF_RSEQIOPS]	=         41708,
587 			[I_LCOEF_RRANDIOPS]	=           370,
588 			[I_LCOEF_WBPS]		=     178075866,
589 			[I_LCOEF_WSEQIOPS]	=         42705,
590 			[I_LCOEF_WRANDIOPS]	=           378,
591 		},
592 	},
593 	[AUTOP_SSD_QD1] = {
594 		.qos				= {
595 			[QOS_RLAT]		=         25000, /* 25ms */
596 			[QOS_WLAT]		=         25000,
597 			[QOS_MIN]		= VRATE_MIN_PPM,
598 			[QOS_MAX]		= VRATE_MAX_PPM,
599 		},
600 		.i_lcoefs			= {
601 			[I_LCOEF_RBPS]		=     245855193,
602 			[I_LCOEF_RSEQIOPS]	=         61575,
603 			[I_LCOEF_RRANDIOPS]	=          6946,
604 			[I_LCOEF_WBPS]		=     141365009,
605 			[I_LCOEF_WSEQIOPS]	=         33716,
606 			[I_LCOEF_WRANDIOPS]	=         26796,
607 		},
608 	},
609 	[AUTOP_SSD_DFL] = {
610 		.qos				= {
611 			[QOS_RLAT]		=         25000, /* 25ms */
612 			[QOS_WLAT]		=         25000,
613 			[QOS_MIN]		= VRATE_MIN_PPM,
614 			[QOS_MAX]		= VRATE_MAX_PPM,
615 		},
616 		.i_lcoefs			= {
617 			[I_LCOEF_RBPS]		=     488636629,
618 			[I_LCOEF_RSEQIOPS]	=          8932,
619 			[I_LCOEF_RRANDIOPS]	=          8518,
620 			[I_LCOEF_WBPS]		=     427891549,
621 			[I_LCOEF_WSEQIOPS]	=         28755,
622 			[I_LCOEF_WRANDIOPS]	=         21940,
623 		},
624 		.too_fast_vrate_pct		=           500,
625 	},
626 	[AUTOP_SSD_FAST] = {
627 		.qos				= {
628 			[QOS_RLAT]		=          5000, /* 5ms */
629 			[QOS_WLAT]		=          5000,
630 			[QOS_MIN]		= VRATE_MIN_PPM,
631 			[QOS_MAX]		= VRATE_MAX_PPM,
632 		},
633 		.i_lcoefs			= {
634 			[I_LCOEF_RBPS]		=    3102524156LLU,
635 			[I_LCOEF_RSEQIOPS]	=        724816,
636 			[I_LCOEF_RRANDIOPS]	=        778122,
637 			[I_LCOEF_WBPS]		=    1742780862LLU,
638 			[I_LCOEF_WSEQIOPS]	=        425702,
639 			[I_LCOEF_WRANDIOPS]	=	 443193,
640 		},
641 		.too_slow_vrate_pct		=            10,
642 	},
643 };
644 
645 /*
646  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
647  * vtime credit shortage and down on device saturation.
648  */
649 static u32 vrate_adj_pct[] =
650 	{ 0, 0, 0, 0,
651 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
652 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
653 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
654 
655 static struct blkcg_policy blkcg_policy_iocost;
656 
657 /* accessors and helpers */
658 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
659 {
660 	return container_of(rqos, struct ioc, rqos);
661 }
662 
663 static struct ioc *q_to_ioc(struct request_queue *q)
664 {
665 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
666 }
667 
668 static const char *q_name(struct request_queue *q)
669 {
670 	if (blk_queue_registered(q))
671 		return kobject_name(q->kobj.parent);
672 	else
673 		return "<unknown>";
674 }
675 
676 static const char __maybe_unused *ioc_name(struct ioc *ioc)
677 {
678 	return q_name(ioc->rqos.q);
679 }
680 
681 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
682 {
683 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
684 }
685 
686 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
687 {
688 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
689 }
690 
691 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
692 {
693 	return pd_to_blkg(&iocg->pd);
694 }
695 
696 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
697 {
698 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
699 			    struct ioc_cgrp, cpd);
700 }
701 
702 /*
703  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
704  * weight, the more expensive each IO.  Must round up.
705  */
706 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
707 {
708 	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
709 }
710 
711 /*
712  * The inverse of abs_cost_to_cost().  Must round up.
713  */
714 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
715 {
716 	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
717 }
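
/*
 * For example, with hw_inuse at WEIGHT_ONE / 2 (a 50% hierarchical share),
 * an IO whose absolute cost is 100us worth of vtime is charged as 200us
 * against the cgroup's vtime cursor by abs_cost_to_cost(), halving its
 * effective issue rate relative to the device as a whole.
 */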
718 
719 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
720 			    u64 abs_cost, u64 cost)
721 {
722 	struct iocg_pcpu_stat *gcs;
723 
724 	bio->bi_iocost_cost = cost;
725 	atomic64_add(cost, &iocg->vtime);
726 
727 	gcs = get_cpu_ptr(iocg->pcpu_stat);
728 	local64_add(abs_cost, &gcs->abs_vusage);
729 	put_cpu_ptr(gcs);
730 }
731 
732 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
733 {
734 	if (lock_ioc) {
735 		spin_lock_irqsave(&iocg->ioc->lock, *flags);
736 		spin_lock(&iocg->waitq.lock);
737 	} else {
738 		spin_lock_irqsave(&iocg->waitq.lock, *flags);
739 	}
740 }
741 
742 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
743 {
744 	if (unlock_ioc) {
745 		spin_unlock(&iocg->waitq.lock);
746 		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
747 	} else {
748 		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
749 	}
750 }
751 
752 #define CREATE_TRACE_POINTS
753 #include <trace/events/iocost.h>
754 
755 static void ioc_refresh_margins(struct ioc *ioc)
756 {
757 	struct ioc_margins *margins = &ioc->margins;
758 	u32 period_us = ioc->period_us;
759 	u64 vrate = ioc->vtime_base_rate;
760 
761 	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
762 	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
763 	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
764 }
765 
766 /* latency QoS params changed, update period_us and all the dependent params */
767 static void ioc_refresh_period_us(struct ioc *ioc)
768 {
769 	u32 ppm, lat, multi, period_us;
770 
771 	lockdep_assert_held(&ioc->lock);
772 
773 	/* pick the higher latency target */
774 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
775 		ppm = ioc->params.qos[QOS_RPPM];
776 		lat = ioc->params.qos[QOS_RLAT];
777 	} else {
778 		ppm = ioc->params.qos[QOS_WPPM];
779 		lat = ioc->params.qos[QOS_WLAT];
780 	}
781 
782 	/*
783 	 * We want the period to be long enough to contain a healthy number
784 	 * of IOs while short enough for granular control.  Define it as a
785 	 * multiple of the latency target.  Ideally, the multiplier should
786 	 * be scaled according to the percentile so that it would nominally
787 	 * contain a certain number of requests.  Let's be simpler and
788 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
789 	 */
790 	if (ppm)
791 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
792 	else
793 		multi = 2;
794 	period_us = multi * lat;
795 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
796 
797 	/* calculate dependent params */
798 	ioc->period_us = period_us;
799 	ioc->timer_slack_ns = div64_u64(
800 		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
801 		100);
802 	ioc_refresh_margins(ioc);
803 }
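
/*
 * For example, with a p95 target (ppm = 950000) and a 25ms latency target,
 * multi = max((1000000 - 950000) / 50000, 2) = 2 and period_us becomes
 * 50ms; a p50 target would yield multi = 10 and a 250ms period, both well
 * within the [MIN_PERIOD, MAX_PERIOD] clamp.
 */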
804 
805 static int ioc_autop_idx(struct ioc *ioc)
806 {
807 	int idx = ioc->autop_idx;
808 	const struct ioc_params *p = &autop[idx];
809 	u32 vrate_pct;
810 	u64 now_ns;
811 
812 	/* rotational? */
813 	if (!blk_queue_nonrot(ioc->rqos.q))
814 		return AUTOP_HDD;
815 
816 	/* handle SATA SSDs w/ broken NCQ */
817 	if (blk_queue_depth(ioc->rqos.q) == 1)
818 		return AUTOP_SSD_QD1;
819 
820 	/* use one of the normal ssd sets */
821 	if (idx < AUTOP_SSD_DFL)
822 		return AUTOP_SSD_DFL;
823 
824 	/* if user is overriding anything, maintain what was there */
825 	if (ioc->user_qos_params || ioc->user_cost_model)
826 		return idx;
827 
828 	/* step up/down based on the vrate */
829 	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
830 	now_ns = ktime_get_ns();
831 
832 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
833 		if (!ioc->autop_too_fast_at)
834 			ioc->autop_too_fast_at = now_ns;
835 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
836 			return idx + 1;
837 	} else {
838 		ioc->autop_too_fast_at = 0;
839 	}
840 
841 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
842 		if (!ioc->autop_too_slow_at)
843 			ioc->autop_too_slow_at = now_ns;
844 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
845 			return idx - 1;
846 	} else {
847 		ioc->autop_too_slow_at = 0;
848 	}
849 
850 	return idx;
851 }
852 
853 /*
854  * Take the following as input
855  *
856  *  @bps	maximum sequential throughput
857  *  @seqiops	maximum sequential 4k iops
858  *  @randiops	maximum random 4k iops
859  *
860  * and calculate the linear model cost coefficients.
861  *
862  *  *@page	per-page cost		1s / (@bps / 4096)
863  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
864  *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
865  */
866 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
867 			u64 *page, u64 *seqio, u64 *randio)
868 {
869 	u64 v;
870 
871 	*page = *seqio = *randio = 0;
872 
873 	if (bps)
874 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
875 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
876 
877 	if (seqiops) {
878 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
879 		if (v > *page)
880 			*seqio = v - *page;
881 	}
882 
883 	if (randiops) {
884 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
885 		if (v > *page)
886 			*randio = v - *page;
887 	}
888 }
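
/*
 * For example, with hypothetical inputs of @bps = 500MB/s, @seqiops = 100k
 * and @randiops = 10k (not the autop defaults):
 *
 *   *page   = 1s / (500M / 4096)        ~= 8.2us of device time
 *   *seqio  = max(1s / 100k - *page, 0) ~= 1.8us
 *   *randio = max(1s / 10k  - *page, 0) ~= 91.8us
 *
 * all scaled to VTIME_PER_SEC-based units by the code above.
 */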
889 
890 static void ioc_refresh_lcoefs(struct ioc *ioc)
891 {
892 	u64 *u = ioc->params.i_lcoefs;
893 	u64 *c = ioc->params.lcoefs;
894 
895 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
896 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
897 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
898 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
899 }
900 
901 static bool ioc_refresh_params(struct ioc *ioc, bool force)
902 {
903 	const struct ioc_params *p;
904 	int idx;
905 
906 	lockdep_assert_held(&ioc->lock);
907 
908 	idx = ioc_autop_idx(ioc);
909 	p = &autop[idx];
910 
911 	if (idx == ioc->autop_idx && !force)
912 		return false;
913 
914 	if (idx != ioc->autop_idx)
915 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
916 
917 	ioc->autop_idx = idx;
918 	ioc->autop_too_fast_at = 0;
919 	ioc->autop_too_slow_at = 0;
920 
921 	if (!ioc->user_qos_params)
922 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
923 	if (!ioc->user_cost_model)
924 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
925 
926 	ioc_refresh_period_us(ioc);
927 	ioc_refresh_lcoefs(ioc);
928 
929 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
930 					    VTIME_PER_USEC, MILLION);
931 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
932 				   VTIME_PER_USEC, MILLION);
933 
934 	return true;
935 }
936 
937 /*
938  * When an iocg accumulates too much vtime or gets deactivated, we throw away
939  * some vtime, which lowers the overall device utilization. As the exact amount
940  * which is being thrown away is known, we can compensate by accelerating the
941  * vrate accordingly so that the extra vtime generated in the current period
942  * matches what got lost.
943  */
944 static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
945 {
946 	s64 pleft = ioc->period_at + ioc->period_us - now->now;
947 	s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
948 	s64 vcomp, vcomp_min, vcomp_max;
949 
950 	lockdep_assert_held(&ioc->lock);
951 
952 	/* we need some time left in this period */
953 	if (pleft <= 0)
954 		goto done;
955 
956 	/*
957 	 * Calculate how much vrate should be adjusted to offset the error.
958 	 * Limit the amount of adjustment and deduct the adjusted amount from
959 	 * the error.
960 	 */
961 	vcomp = -div64_s64(ioc->vtime_err, pleft);
962 	vcomp_min = -(ioc->vtime_base_rate >> 1);
963 	vcomp_max = ioc->vtime_base_rate;
964 	vcomp = clamp(vcomp, vcomp_min, vcomp_max);
965 
966 	ioc->vtime_err += vcomp * pleft;
967 
968 	atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
969 done:
970 	/* bound how much error can accumulate */
971 	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
972 }
973 
974 static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
975 				  int nr_lagging, int nr_shortages,
976 				  int prev_busy_level, u32 *missed_ppm)
977 {
978 	u64 vrate = ioc->vtime_base_rate;
979 	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
980 
981 	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
982 		if (ioc->busy_level != prev_busy_level || nr_lagging)
983 			trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
984 						   missed_ppm, rq_wait_pct,
985 						   nr_lagging, nr_shortages);
986 
987 		return;
988 	}
989 
990 	/*
991 	 * If vrate is out of bounds, apply clamp gradually as the
992 	 * bounds can change abruptly.  Otherwise, apply busy_level
993 	 * based adjustment.
994 	 */
995 	if (vrate < vrate_min) {
996 		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
997 		vrate = min(vrate, vrate_min);
998 	} else if (vrate > vrate_max) {
999 		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1000 		vrate = max(vrate, vrate_max);
1001 	} else {
1002 		int idx = min_t(int, abs(ioc->busy_level),
1003 				ARRAY_SIZE(vrate_adj_pct) - 1);
1004 		u32 adj_pct = vrate_adj_pct[idx];
1005 
1006 		if (ioc->busy_level > 0)
1007 			adj_pct = 100 - adj_pct;
1008 		else
1009 			adj_pct = 100 + adj_pct;
1010 
1011 		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1012 			      vrate_min, vrate_max);
1013 	}
1014 
1015 	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1016 				   nr_lagging, nr_shortages);
1017 
1018 	ioc->vtime_base_rate = vrate;
1019 	ioc_refresh_margins(ioc);
1020 }
1021 
1022 /* take a snapshot of the current [v]time and vrate */
1023 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1024 {
1025 	unsigned seq;
1026 
1027 	now->now_ns = ktime_get();
1028 	now->now = ktime_to_us(now->now_ns);
1029 	now->vrate = atomic64_read(&ioc->vtime_rate);
1030 
1031 	/*
1032 	 * The current vtime is
1033 	 *
1034 	 *   vtime at period start + (wallclock time since the start) * vrate
1035 	 *
1036 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
1037 	 * needed, they're seqcount protected.
1038 	 */
1039 	do {
1040 		seq = read_seqcount_begin(&ioc->period_seqcount);
1041 		now->vnow = ioc->period_at_vtime +
1042 			(now->now - ioc->period_at) * now->vrate;
1043 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
1044 }
1045 
1046 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1047 {
1048 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1049 
1050 	write_seqcount_begin(&ioc->period_seqcount);
1051 	ioc->period_at = now->now;
1052 	ioc->period_at_vtime = now->vnow;
1053 	write_seqcount_end(&ioc->period_seqcount);
1054 
1055 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1056 	add_timer(&ioc->timer);
1057 }
1058 
1059 /*
1060  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1061  * weight sums and propagate upwards accordingly. If @save, the current margin
1062  * is saved to be used as reference for later inuse in-period adjustments.
1063  */
1064 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1065 				bool save, struct ioc_now *now)
1066 {
1067 	struct ioc *ioc = iocg->ioc;
1068 	int lvl;
1069 
1070 	lockdep_assert_held(&ioc->lock);
1071 
1072 	inuse = clamp_t(u32, inuse, 1, active);
1073 
1074 	iocg->last_inuse = iocg->inuse;
1075 	if (save)
1076 		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1077 
1078 	if (active == iocg->active && inuse == iocg->inuse)
1079 		return;
1080 
1081 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1082 		struct ioc_gq *parent = iocg->ancestors[lvl];
1083 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1084 		u32 parent_active = 0, parent_inuse = 0;
1085 
1086 		/* update the level sums */
1087 		parent->child_active_sum += (s32)(active - child->active);
1088 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
1089 		/* apply the updates */
1090 		child->active = active;
1091 		child->inuse = inuse;
1092 
1093 		/*
1094 		 * The delta between inuse and active sums indicates that
1095 		 * much of weight is being given away.  Parent's inuse
1096 		 * and active should reflect the ratio.
1097 		 */
1098 		if (parent->child_active_sum) {
1099 			parent_active = parent->weight;
1100 			parent_inuse = DIV64_U64_ROUND_UP(
1101 				parent_active * parent->child_inuse_sum,
1102 				parent->child_active_sum);
1103 		}
1104 
1105 		/* do we need to keep walking up? */
1106 		if (parent_active == parent->active &&
1107 		    parent_inuse == parent->inuse)
1108 			break;
1109 
1110 		active = parent_active;
1111 		inuse = parent_inuse;
1112 	}
1113 
1114 	ioc->weights_updated = true;
1115 }
1116 
1117 static void commit_weights(struct ioc *ioc)
1118 {
1119 	lockdep_assert_held(&ioc->lock);
1120 
1121 	if (ioc->weights_updated) {
1122 		/* paired with rmb in current_hweight(), see there */
1123 		smp_wmb();
1124 		atomic_inc(&ioc->hweight_gen);
1125 		ioc->weights_updated = false;
1126 	}
1127 }
1128 
1129 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1130 			      bool save, struct ioc_now *now)
1131 {
1132 	__propagate_weights(iocg, active, inuse, save, now);
1133 	commit_weights(iocg->ioc);
1134 }
1135 
1136 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1137 {
1138 	struct ioc *ioc = iocg->ioc;
1139 	int lvl;
1140 	u32 hwa, hwi;
1141 	int ioc_gen;
1142 
1143 	/* hot path - if uptodate, use cached */
1144 	ioc_gen = atomic_read(&ioc->hweight_gen);
1145 	if (ioc_gen == iocg->hweight_gen)
1146 		goto out;
1147 
1148 	/*
1149 	 * Paired with wmb in commit_weights(). If we saw the updated
1150 	 * hweight_gen, all the weight updates from __propagate_weights() are
1151 	 * visible too.
1152 	 *
1153 	 * We can race with weight updates during calculation and get it
1154 	 * wrong.  However, hweight_gen would have changed and a future
1155 	 * reader will recalculate and we're guaranteed to discard the
1156 	 * wrong result soon.
1157 	 */
1158 	smp_rmb();
1159 
1160 	hwa = hwi = WEIGHT_ONE;
1161 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1162 		struct ioc_gq *parent = iocg->ancestors[lvl];
1163 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1164 		u64 active_sum = READ_ONCE(parent->child_active_sum);
1165 		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1166 		u32 active = READ_ONCE(child->active);
1167 		u32 inuse = READ_ONCE(child->inuse);
1168 
1169 		/* we can race with deactivations and either may read as zero */
1170 		if (!active_sum || !inuse_sum)
1171 			continue;
1172 
1173 		active_sum = max_t(u64, active, active_sum);
1174 		hwa = div64_u64((u64)hwa * active, active_sum);
1175 
1176 		inuse_sum = max_t(u64, inuse, inuse_sum);
1177 		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1178 	}
1179 
1180 	iocg->hweight_active = max_t(u32, hwa, 1);
1181 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1182 	iocg->hweight_gen = ioc_gen;
1183 out:
1184 	if (hw_activep)
1185 		*hw_activep = iocg->hweight_active;
1186 	if (hw_inusep)
1187 		*hw_inusep = iocg->hweight_inuse;
1188 }
1189 
1190 /*
1191  * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1192  * other weights stay unchanged.
1193  */
1194 static u32 current_hweight_max(struct ioc_gq *iocg)
1195 {
1196 	u32 hwm = WEIGHT_ONE;
1197 	u32 inuse = iocg->active;
1198 	u64 child_inuse_sum;
1199 	int lvl;
1200 
1201 	lockdep_assert_held(&iocg->ioc->lock);
1202 
1203 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1204 		struct ioc_gq *parent = iocg->ancestors[lvl];
1205 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1206 
1207 		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1208 		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1209 		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1210 					   parent->child_active_sum);
1211 	}
1212 
1213 	return max_t(u32, hwm, 1);
1214 }
1215 
1216 static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1217 {
1218 	struct ioc *ioc = iocg->ioc;
1219 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1220 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1221 	u32 weight;
1222 
1223 	lockdep_assert_held(&ioc->lock);
1224 
1225 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1226 	if (weight != iocg->weight && iocg->active)
1227 		propagate_weights(iocg, weight, iocg->inuse, true, now);
1228 	iocg->weight = weight;
1229 }
1230 
1231 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1232 {
1233 	struct ioc *ioc = iocg->ioc;
1234 	u64 last_period, cur_period;
1235 	u64 vtime, vtarget;
1236 	int i;
1237 
1238 	/*
1239 	 * If we seem to be already active, just update the stamp to tell the
1240 	 * timer that we're still active.  We don't mind occasional races.
1241 	 */
1242 	if (!list_empty(&iocg->active_list)) {
1243 		ioc_now(ioc, now);
1244 		cur_period = atomic64_read(&ioc->cur_period);
1245 		if (atomic64_read(&iocg->active_period) != cur_period)
1246 			atomic64_set(&iocg->active_period, cur_period);
1247 		return true;
1248 	}
1249 
1250 	/* racy check on internal node IOs, treat as root level IOs */
1251 	if (iocg->child_active_sum)
1252 		return false;
1253 
1254 	spin_lock_irq(&ioc->lock);
1255 
1256 	ioc_now(ioc, now);
1257 
1258 	/* update period */
1259 	cur_period = atomic64_read(&ioc->cur_period);
1260 	last_period = atomic64_read(&iocg->active_period);
1261 	atomic64_set(&iocg->active_period, cur_period);
1262 
1263 	/* already activated or breaking leaf-only constraint? */
1264 	if (!list_empty(&iocg->active_list))
1265 		goto succeed_unlock;
1266 	for (i = iocg->level - 1; i > 0; i--)
1267 		if (!list_empty(&iocg->ancestors[i]->active_list))
1268 			goto fail_unlock;
1269 
1270 	if (iocg->child_active_sum)
1271 		goto fail_unlock;
1272 
1273 	/*
1274 	 * Always start with the target budget. On deactivation, we throw away
1275 	 * anything above it.
1276 	 */
1277 	vtarget = now->vnow - ioc->margins.target;
1278 	vtime = atomic64_read(&iocg->vtime);
1279 
1280 	atomic64_add(vtarget - vtime, &iocg->vtime);
1281 	atomic64_add(vtarget - vtime, &iocg->done_vtime);
1282 	vtime = vtarget;
1283 
1284 	/*
1285 	 * Activate, propagate weight and start period timer if not
1286 	 * running.  Reset hweight_gen to avoid accidental match from
1287 	 * wrapping.
1288 	 */
1289 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1290 	list_add(&iocg->active_list, &ioc->active_iocgs);
1291 
1292 	propagate_weights(iocg, iocg->weight,
1293 			  iocg->last_inuse ?: iocg->weight, true, now);
1294 
1295 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1296 			last_period, cur_period, vtime);
1297 
1298 	iocg->activated_at = now->now;
1299 
1300 	if (ioc->running == IOC_IDLE) {
1301 		ioc->running = IOC_RUNNING;
1302 		ioc->dfgv_period_at = now->now;
1303 		ioc->dfgv_period_rem = 0;
1304 		ioc_start_period(ioc, now);
1305 	}
1306 
1307 succeed_unlock:
1308 	spin_unlock_irq(&ioc->lock);
1309 	return true;
1310 
1311 fail_unlock:
1312 	spin_unlock_irq(&ioc->lock);
1313 	return false;
1314 }
1315 
1316 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1317 {
1318 	struct ioc *ioc = iocg->ioc;
1319 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1320 	u64 tdelta, delay, new_delay;
1321 	s64 vover, vover_pct;
1322 	u32 hwa;
1323 
1324 	lockdep_assert_held(&iocg->waitq.lock);
1325 
1326 	/* calculate the current delay in effect - 1/2 every second */
1327 	tdelta = now->now - iocg->delay_at;
1328 	if (iocg->delay)
1329 		delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1330 	else
1331 		delay = 0;
1332 
1333 	/* calculate the new delay from the debt amount */
1334 	current_hweight(iocg, &hwa, NULL);
1335 	vover = atomic64_read(&iocg->vtime) +
1336 		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1337 	vover_pct = div64_s64(100 * vover,
1338 			      ioc->period_us * ioc->vtime_base_rate);
1339 
1340 	if (vover_pct <= MIN_DELAY_THR_PCT)
1341 		new_delay = 0;
1342 	else if (vover_pct >= MAX_DELAY_THR_PCT)
1343 		new_delay = MAX_DELAY;
1344 	else
1345 		new_delay = MIN_DELAY +
1346 			div_u64((MAX_DELAY - MIN_DELAY) *
1347 				(vover_pct - MIN_DELAY_THR_PCT),
1348 				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1349 
1350 	/* pick the higher one and apply */
1351 	if (new_delay > delay) {
1352 		iocg->delay = new_delay;
1353 		iocg->delay_at = now->now;
1354 		delay = new_delay;
1355 	}
1356 
1357 	if (delay >= MIN_DELAY) {
1358 		if (!iocg->indelay_since)
1359 			iocg->indelay_since = now->now;
1360 		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1361 		return true;
1362 	} else {
1363 		if (iocg->indelay_since) {
1364 			iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
1365 			iocg->indelay_since = 0;
1366 		}
1367 		iocg->delay = 0;
1368 		blkcg_clear_delay(blkg);
1369 		return false;
1370 	}
1371 }
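
/*
 * For example, if the overage amounts to 100 periods (vover_pct = 10000),
 * the new delay interpolates to roughly MIN_DELAY + (MAX_DELAY - MIN_DELAY)
 * * (10000 - 500) / (25000 - 500), i.e. ~97ms, and whatever delay is in
 * effect halves for every full second that has passed since delay_at.
 */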
1372 
1373 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1374 			    struct ioc_now *now)
1375 {
1376 	struct iocg_pcpu_stat *gcs;
1377 
1378 	lockdep_assert_held(&iocg->ioc->lock);
1379 	lockdep_assert_held(&iocg->waitq.lock);
1380 	WARN_ON_ONCE(list_empty(&iocg->active_list));
1381 
1382 	/*
1383 	 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1384 	 * inuse, donating all of its share to others until its debt is paid off.
1385 	 */
1386 	if (!iocg->abs_vdebt && abs_cost) {
1387 		iocg->indebt_since = now->now;
1388 		propagate_weights(iocg, iocg->active, 0, false, now);
1389 	}
1390 
1391 	iocg->abs_vdebt += abs_cost;
1392 
1393 	gcs = get_cpu_ptr(iocg->pcpu_stat);
1394 	local64_add(abs_cost, &gcs->abs_vusage);
1395 	put_cpu_ptr(gcs);
1396 }
1397 
1398 static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1399 			  struct ioc_now *now)
1400 {
1401 	lockdep_assert_held(&iocg->ioc->lock);
1402 	lockdep_assert_held(&iocg->waitq.lock);
1403 
1404 	/* make sure that nobody messed with @iocg */
1405 	WARN_ON_ONCE(list_empty(&iocg->active_list));
1406 	WARN_ON_ONCE(iocg->inuse > 1);
1407 
1408 	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1409 
1410 	/* if debt is paid in full, restore inuse */
1411 	if (!iocg->abs_vdebt) {
1412 		iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
1413 		iocg->indebt_since = 0;
1414 
1415 		propagate_weights(iocg, iocg->active, iocg->last_inuse,
1416 				  false, now);
1417 	}
1418 }
1419 
1420 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1421 			int flags, void *key)
1422 {
1423 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1424 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1425 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1426 
1427 	ctx->vbudget -= cost;
1428 
1429 	if (ctx->vbudget < 0)
1430 		return -1;
1431 
1432 	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1433 
1434 	/*
1435 	 * autoremove_wake_function() removes the wait entry only when it
1436 	 * actually changed the task state.  We want the wait always
1437 	 * removed.  Remove explicitly and use default_wake_function().
1438 	 */
1439 	list_del_init(&wq_entry->entry);
1440 	wait->committed = true;
1441 
1442 	default_wake_function(wq_entry, mode, flags, key);
1443 	return 0;
1444 }
1445 
1446 /*
1447  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1448  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1449  * addition to iocg->waitq.lock.
1450  */
1451 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1452 			    struct ioc_now *now)
1453 {
1454 	struct ioc *ioc = iocg->ioc;
1455 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1456 	u64 vshortage, expires, oexpires;
1457 	s64 vbudget;
1458 	u32 hwa;
1459 
1460 	lockdep_assert_held(&iocg->waitq.lock);
1461 
1462 	current_hweight(iocg, &hwa, NULL);
1463 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1464 
1465 	/* pay off debt */
1466 	if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1467 		u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1468 		u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1469 		u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1470 
1471 		lockdep_assert_held(&ioc->lock);
1472 
1473 		atomic64_add(vpay, &iocg->vtime);
1474 		atomic64_add(vpay, &iocg->done_vtime);
1475 		iocg_pay_debt(iocg, abs_vpay, now);
1476 		vbudget -= vpay;
1477 	}
1478 
1479 	if (iocg->abs_vdebt || iocg->delay)
1480 		iocg_kick_delay(iocg, now);
1481 
1482 	/*
1483 	 * Debt can still be outstanding if we haven't paid all yet or the
1484 	 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1485 	 * under debt. Make sure @vbudget reflects the outstanding amount and is
1486 	 * not positive.
1487 	 */
1488 	if (iocg->abs_vdebt) {
1489 		s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1490 		vbudget = min_t(s64, 0, vbudget - vdebt);
1491 	}
1492 
1493 	/*
1494 	 * Wake up the ones which are due and see how much vtime we'll need for
1495 	 * the next one. As paying off debt restores hw_inuse, it must be read
1496 	 * after the above debt payment.
1497 	 */
1498 	ctx.vbudget = vbudget;
1499 	current_hweight(iocg, NULL, &ctx.hw_inuse);
1500 
1501 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1502 
1503 	if (!waitqueue_active(&iocg->waitq)) {
1504 		if (iocg->wait_since) {
1505 			iocg->local_stat.wait_us += now->now - iocg->wait_since;
1506 			iocg->wait_since = 0;
1507 		}
1508 		return;
1509 	}
1510 
1511 	if (!iocg->wait_since)
1512 		iocg->wait_since = now->now;
1513 
1514 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1515 		return;
1516 
1517 	/* determine next wakeup, add a timer margin to guarantee chunking */
1518 	vshortage = -ctx.vbudget;
1519 	expires = now->now_ns +
1520 		DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1521 		NSEC_PER_USEC;
1522 	expires += ioc->timer_slack_ns;
1523 
1524 	/* if already active and close enough, don't bother */
1525 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1526 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1527 	    abs(oexpires - expires) <= ioc->timer_slack_ns)
1528 		return;
1529 
1530 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1531 			       ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1532 }
1533 
1534 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1535 {
1536 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1537 	bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1538 	struct ioc_now now;
1539 	unsigned long flags;
1540 
1541 	ioc_now(iocg->ioc, &now);
1542 
1543 	iocg_lock(iocg, pay_debt, &flags);
1544 	iocg_kick_waitq(iocg, pay_debt, &now);
1545 	iocg_unlock(iocg, pay_debt, &flags);
1546 
1547 	return HRTIMER_NORESTART;
1548 }
1549 
1550 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1551 {
1552 	u32 nr_met[2] = { };
1553 	u32 nr_missed[2] = { };
1554 	u64 rq_wait_ns = 0;
1555 	int cpu, rw;
1556 
1557 	for_each_online_cpu(cpu) {
1558 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1559 		u64 this_rq_wait_ns;
1560 
1561 		for (rw = READ; rw <= WRITE; rw++) {
1562 			u32 this_met = local_read(&stat->missed[rw].nr_met);
1563 			u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1564 
1565 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1566 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1567 			stat->missed[rw].last_met = this_met;
1568 			stat->missed[rw].last_missed = this_missed;
1569 		}
1570 
1571 		this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1572 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1573 		stat->last_rq_wait_ns = this_rq_wait_ns;
1574 	}
1575 
1576 	for (rw = READ; rw <= WRITE; rw++) {
1577 		if (nr_met[rw] + nr_missed[rw])
1578 			missed_ppm_ar[rw] =
1579 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1580 						   nr_met[rw] + nr_missed[rw]);
1581 		else
1582 			missed_ppm_ar[rw] = 0;
1583 	}
1584 
1585 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1586 				   ioc->period_us * NSEC_PER_USEC);
1587 }
1588 
1589 /* was iocg idle this period? */
1590 static bool iocg_is_idle(struct ioc_gq *iocg)
1591 {
1592 	struct ioc *ioc = iocg->ioc;
1593 
1594 	/* did something get issued this period? */
1595 	if (atomic64_read(&iocg->active_period) ==
1596 	    atomic64_read(&ioc->cur_period))
1597 		return false;
1598 
1599 	/* is something in flight? */
1600 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1601 		return false;
1602 
1603 	return true;
1604 }
1605 
1606 /*
1607  * Call this function on the target leaf @iocg's to build pre-order traversal
1608  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1609  * ->walk_list and the caller is responsible for dissolving the list after use.
1610  */
1611 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1612 				  struct list_head *inner_walk)
1613 {
1614 	int lvl;
1615 
1616 	WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1617 
1618 	/* find the first ancestor which hasn't been visited yet */
1619 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1620 		if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1621 			break;
1622 	}
1623 
1624 	/* walk down and visit the inner nodes to get pre-order traversal */
1625 	while (++lvl <= iocg->level - 1) {
1626 		struct ioc_gq *inner = iocg->ancestors[lvl];
1627 
1628 		/* record traversal order */
1629 		list_add_tail(&inner->walk_list, inner_walk);
1630 	}
1631 }
1632 
1633 /* collect per-cpu counters and propagate the deltas to the parent */
1634 static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1635 {
1636 	struct ioc *ioc = iocg->ioc;
1637 	struct iocg_stat new_stat;
1638 	u64 abs_vusage = 0;
1639 	u64 vusage_delta;
1640 	int cpu;
1641 
1642 	lockdep_assert_held(&iocg->ioc->lock);
1643 
1644 	/* collect per-cpu counters */
1645 	for_each_possible_cpu(cpu) {
1646 		abs_vusage += local64_read(
1647 				per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1648 	}
1649 	vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1650 	iocg->last_stat_abs_vusage = abs_vusage;
1651 
1652 	iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1653 	iocg->local_stat.usage_us += iocg->usage_delta_us;
1654 
1655 	/* propagate upwards */
1656 	new_stat.usage_us =
1657 		iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1658 	new_stat.wait_us =
1659 		iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
1660 	new_stat.indebt_us =
1661 		iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
1662 	new_stat.indelay_us =
1663 		iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
1664 
1665 	/* propagate the deltas to the parent */
1666 	if (iocg->level > 0) {
1667 		struct iocg_stat *parent_stat =
1668 			&iocg->ancestors[iocg->level - 1]->desc_stat;
1669 
1670 		parent_stat->usage_us +=
1671 			new_stat.usage_us - iocg->last_stat.usage_us;
1672 		parent_stat->wait_us +=
1673 			new_stat.wait_us - iocg->last_stat.wait_us;
1674 		parent_stat->indebt_us +=
1675 			new_stat.indebt_us - iocg->last_stat.indebt_us;
1676 		parent_stat->indelay_us +=
1677 			new_stat.indelay_us - iocg->last_stat.indelay_us;
1678 	}
1679 
1680 	iocg->last_stat = new_stat;
1681 }
1682 
1683 /* get stat counters ready for reading on all active iocgs */
1684 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1685 {
1686 	LIST_HEAD(inner_walk);
1687 	struct ioc_gq *iocg, *tiocg;
1688 
1689 	/* flush leaves and build inner node walk list */
1690 	list_for_each_entry(iocg, target_iocgs, active_list) {
1691 		iocg_flush_stat_one(iocg, now);
1692 		iocg_build_inner_walk(iocg, &inner_walk);
1693 	}
1694 
1695 	/* keep flushing upwards by walking the inner list backwards */
1696 	list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1697 		iocg_flush_stat_one(iocg, now);
1698 		list_del_init(&iocg->walk_list);
1699 	}
1700 }
1701 
1702 /*
1703  * Determine what @iocg's hweight_inuse should be after donating unused
1704  * capacity. @hwm is the upper bound and used to signal no donation. This
1705  * function also throws away @iocg's excess budget.
1706  */
1707 static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1708 				  u32 usage, struct ioc_now *now)
1709 {
1710 	struct ioc *ioc = iocg->ioc;
1711 	u64 vtime = atomic64_read(&iocg->vtime);
1712 	s64 excess, delta, target, new_hwi;
1713 
1714 	/* debt handling owns inuse for debtors */
1715 	if (iocg->abs_vdebt)
1716 		return 1;
1717 
1718 	/* see whether minimum margin requirement is met */
1719 	if (waitqueue_active(&iocg->waitq) ||
1720 	    time_after64(vtime, now->vnow - ioc->margins.min))
1721 		return hwm;
1722 
1723 	/* throw away excess above target */
1724 	excess = now->vnow - vtime - ioc->margins.target;
1725 	if (excess > 0) {
1726 		atomic64_add(excess, &iocg->vtime);
1727 		atomic64_add(excess, &iocg->done_vtime);
1728 		vtime += excess;
1729 		ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1730 	}
1731 
1732 	/*
1733 	 * Let delta be the distance between the iocg's and the device's vtimes
1734 	 * as a fraction of the period duration. Assuming that the iocg will
1735 	 * consume the usage determined above, we want to determine new_hwi so
1736 	 * that delta equals MARGIN_TARGET at the end of the next period.
1737 	 *
1738 	 * We need to execute usage worth of IOs while spending the sum of the
1739 	 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1740 	 * (delta):
1741 	 *
1742 	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1743 	 *
1744 	 * Therefore, the new_hwi is:
1745 	 *
1746 	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1747 	 */
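	/*
	 * Worked example (hypothetical numbers, assuming MARGIN_TARGET_PCT is
	 * 50): with usage = 0.4 and delta = 0.3, expressed as fractions of
	 * WEIGHT_ONE, new_hwi = 0.4 / (1 - 0.5 + 0.3) = 0.5.
	 */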
1748 	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1749 			  now->vnow - ioc->period_at_vtime);
1750 	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1751 	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1752 
1753 	return clamp_t(s64, new_hwi, 1, hwm);
1754 }
1755 
1756 /*
1757  * For work-conservation, an iocg which isn't using all of its share should
1758  * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1759  * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1760  *
1761  * #1 is mathematically simpler but has the drawback of requiring synchronous
1762  * global hweight_inuse updates when idle iocg's get activated or inuse weights
1763  * change due to donation snapbacks as it has the possibility of grossly
1764  * overshooting what's allowed by the model and vrate.
1765  *
1766  * #2 is inherently safe with local operations. The donating iocg can easily
1767  * snap back to higher weights when needed without worrying about impacts on
1768  * other nodes as the impacts will be inherently correct. This also makes idle
1769  * iocg activations safe. The only effect activations have is decreasing
1770  * hweight_inuse of others, the right solution to which is for those iocgs to
1771  * snap back to higher weights.
1772  *
1773  * So, we go with #2. The challenge is calculating how each donating iocg's
1774  * inuse should be adjusted to achieve the target donation amounts. This is done
1775  * using Andy's method described in the following pdf.
1776  *
1777  *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1778  *
1779  * Given the weights and target after-donation hweight_inuse values, Andy's
1780  * method determines how the proportional distribution should look at each
1781  * sibling level to maintain the relative relationship between all non-donating
1782  * pairs. To roughly summarize, it divides the tree into donating and
1783  * non-donating parts, calculates the global donation rate which is used to
1784  * determine the target hweight_inuse for each node, and then derives per-level
1785  * proportions.
1786  *
1787  * The following pdf shows that global distribution calculated this way can be
1788  * achieved by scaling inuse weights of donating leaves and propagating the
1789  * adjustments upwards proportionally.
1790  *
1791  *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1792  *
1793  * Combining the above two, we can determine how each leaf iocg's inuse should
1794  * be adjusted to achieve the target donation.
1795  *
1796  *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1797  *
1798  * The inline comments use symbols from the last pdf.
1799  *
1800  *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1801  *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1802  *   t is the sum of the absolute budgets of donating nodes in the subtree.
1803  *   w is the weight of the node. w = w_f + w_t
1804  *   w_f is the non-donating portion of w. w_f = w * f / b
1805  *   w_t is the donating portion of w. w_t = w * t / b
1806  *   s is the sum of all sibling weights. s = Sum(w) for siblings
1807  *   s_f and s_t are the non-donating and donating portions of s.
1808  *
1809  * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1810  * w_pt is the donating portion of the parent's weight and w'_pt the same value
1811  * after adjustments. Subscript r denotes the root node's values.
1812  */
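/*
 * Illustrative example (hypothetical numbers): suppose the root has one
 * non-donating child with hweight 0.6 and one donating child with
 * hweight_donating 0.4 which wants hweight_after_donation 0.1.  Then
 * t_r = 0.4, t_r' = 0.1 and gamma = (1 - 0.1) / (1 - 0.4) = 1.5, so the
 * non-donating budget is scaled to 0.6 * 1.5 = 0.9, absorbing exactly the
 * donated 0.3.
 */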
1813 static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1814 {
1815 	LIST_HEAD(over_hwa);
1816 	LIST_HEAD(inner_walk);
1817 	struct ioc_gq *iocg, *tiocg, *root_iocg;
1818 	u32 after_sum, over_sum, over_target, gamma;
1819 
1820 	/*
1821 	 * It's pretty unlikely but possible for the total sum of
1822 	 * hweight_after_donation values to be higher than WEIGHT_ONE, which will
1823 	 * confuse the following calculations. If such a condition is detected,
1824 	 * scale down everyone over their full share equally to keep the sum below
1825 	 * WEIGHT_ONE.
1826 	 */
1827 	after_sum = 0;
1828 	over_sum = 0;
1829 	list_for_each_entry(iocg, surpluses, surplus_list) {
1830 		u32 hwa;
1831 
1832 		current_hweight(iocg, &hwa, NULL);
1833 		after_sum += iocg->hweight_after_donation;
1834 
1835 		if (iocg->hweight_after_donation > hwa) {
1836 			over_sum += iocg->hweight_after_donation;
1837 			list_add(&iocg->walk_list, &over_hwa);
1838 		}
1839 	}
1840 
1841 	if (after_sum >= WEIGHT_ONE) {
1842 		/*
1843 		 * The delta should be deducted from over_sum; calculate the
1844 		 * target over_sum value.
1845 		 */
1846 		u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1847 		WARN_ON_ONCE(over_sum <= over_delta);
1848 		over_target = over_sum - over_delta;
1849 	} else {
1850 		over_target = 0;
1851 	}
1852 
1853 	list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1854 		if (over_target)
1855 			iocg->hweight_after_donation =
1856 				div_u64((u64)iocg->hweight_after_donation *
1857 					over_target, over_sum);
1858 		list_del_init(&iocg->walk_list);
1859 	}
1860 
1861 	/*
1862 	 * Build pre-order inner node walk list and prepare for donation
1863 	 * adjustment calculations.
1864 	 */
1865 	list_for_each_entry(iocg, surpluses, surplus_list) {
1866 		iocg_build_inner_walk(iocg, &inner_walk);
1867 	}
1868 
1869 	root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1870 	WARN_ON_ONCE(root_iocg->level > 0);
1871 
1872 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1873 		iocg->child_adjusted_sum = 0;
1874 		iocg->hweight_donating = 0;
1875 		iocg->hweight_after_donation = 0;
1876 	}
1877 
1878 	/*
1879 	 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1880 	 * up the hierarchy.
1881 	 */
1882 	list_for_each_entry(iocg, surpluses, surplus_list) {
1883 		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1884 
1885 		parent->hweight_donating += iocg->hweight_donating;
1886 		parent->hweight_after_donation += iocg->hweight_after_donation;
1887 	}
1888 
1889 	list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1890 		if (iocg->level > 0) {
1891 			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1892 
1893 			parent->hweight_donating += iocg->hweight_donating;
1894 			parent->hweight_after_donation += iocg->hweight_after_donation;
1895 		}
1896 	}
1897 
1898 	/*
1899 	 * Calculate inner hwa's (b) and make sure the donation values are
1900 	 * within the accepted ranges as we're doing low res calculations with
1901 	 * roundups.
1902 	 */
1903 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1904 		if (iocg->level) {
1905 			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1906 
1907 			iocg->hweight_active = DIV64_U64_ROUND_UP(
1908 				(u64)parent->hweight_active * iocg->active,
1909 				parent->child_active_sum);
1910 
1911 		}
1912 
1913 		iocg->hweight_donating = min(iocg->hweight_donating,
1914 					     iocg->hweight_active);
1915 		iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1916 						   iocg->hweight_donating - 1);
1917 		if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1918 				 iocg->hweight_donating <= 1 ||
1919 				 iocg->hweight_after_donation == 0)) {
1920 			pr_warn("iocg: invalid donation weights in ");
1921 			pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1922 			pr_cont(": active=%u donating=%u after=%u\n",
1923 				iocg->hweight_active, iocg->hweight_donating,
1924 				iocg->hweight_after_donation);
1925 		}
1926 	}
1927 
1928 	/*
1929 	 * Calculate the global donation rate (gamma) - the rate to adjust
1930 	 * non-donating budgets by.
1931 	 *
1932 	 * No need to use 64bit multiplication here as the first operand is
1933 	 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1934 	 *
1935 	 * We know that there are beneficiary nodes and the sum of the donating
1936 	 * hweights can't be whole; however, due to the round-ups during hweight
1937 	 * calculations, root_iocg->hweight_donating might still end up equal to
1938 	 * or greater than whole. Limit the range when calculating the divider.
1939 	 *
1940 	 * gamma = (1 - t_r') / (1 - t_r)
1941 	 */
1942 	gamma = DIV_ROUND_UP(
1943 		(WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1944 		WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1945 
1946 	/*
1947 	 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1948 	 * nodes.
1949 	 */
1950 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1951 		struct ioc_gq *parent;
1952 		u32 inuse, wpt, wptp;
1953 		u64 st, sf;
1954 
1955 		if (iocg->level == 0) {
1956 			/* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1957 			iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1958 				iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1959 				WEIGHT_ONE - iocg->hweight_after_donation);
1960 			continue;
1961 		}
1962 
1963 		parent = iocg->ancestors[iocg->level - 1];
1964 
1965 		/* b' = gamma * b_f + b_t' */
1966 		iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1967 			(u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1968 			WEIGHT_ONE) + iocg->hweight_after_donation;
1969 
1970 		/* w' = s' * b' / b'_p */
1971 		inuse = DIV64_U64_ROUND_UP(
1972 			(u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1973 			parent->hweight_inuse);
1974 
1975 		/* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1976 		st = DIV64_U64_ROUND_UP(
1977 			iocg->child_active_sum * iocg->hweight_donating,
1978 			iocg->hweight_active);
1979 		sf = iocg->child_active_sum - st;
1980 		wpt = DIV64_U64_ROUND_UP(
1981 			(u64)iocg->active * iocg->hweight_donating,
1982 			iocg->hweight_active);
1983 		wptp = DIV64_U64_ROUND_UP(
1984 			(u64)inuse * iocg->hweight_after_donation,
1985 			iocg->hweight_inuse);
1986 
1987 		iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1988 	}
1989 
1990 	/*
1991 	 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
1992 	 * we can finally determine leaf adjustments.
1993 	 */
1994 	list_for_each_entry(iocg, surpluses, surplus_list) {
1995 		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1996 		u32 inuse;
1997 
1998 		/*
1999 		 * In-debt iocgs participated in the donation calculation with
2000 		 * the minimum target hweight_inuse. Configuring inuse
2001 		 * accordingly would work fine but debt handling expects
2002 		 * @iocg->inuse to stay at the minimum and we don't want to
2003 		 * interfere.
2004 		 */
2005 		if (iocg->abs_vdebt) {
2006 			WARN_ON_ONCE(iocg->inuse > 1);
2007 			continue;
2008 		}
2009 
2010 		/* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2011 		inuse = DIV64_U64_ROUND_UP(
2012 			parent->child_adjusted_sum * iocg->hweight_after_donation,
2013 			parent->hweight_inuse);
2014 
2015 		TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2016 				iocg->inuse, inuse,
2017 				iocg->hweight_inuse,
2018 				iocg->hweight_after_donation);
2019 
2020 		__propagate_weights(iocg, iocg->active, inuse, true, now);
2021 	}
2022 
2023 	/* walk list should be dissolved after use */
2024 	list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2025 		list_del_init(&iocg->walk_list);
2026 }
2027 
2028 /*
2029  * A low weight iocg can amass a large amount of debt, for example, when
2030  * anonymous memory gets reclaimed aggressively. If the system has a lot of
2031  * memory paired with a slow IO device, the debt can span multiple seconds or
2032  * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2033  * up blocked paying its debt while the IO device is idle.
2034  *
2035  * The following protects against such cases. If the device has been
2036  * sufficiently idle for a while, the debts are halved and delays are
2037  * recalculated.
2038  */
2039 static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2040 			      struct ioc_now *now)
2041 {
2042 	struct ioc_gq *iocg;
2043 	u64 dur, usage_pct, nr_cycles;
2044 
2045 	/* if no debtor, reset the cycle */
2046 	if (!nr_debtors) {
2047 		ioc->dfgv_period_at = now->now;
2048 		ioc->dfgv_period_rem = 0;
2049 		ioc->dfgv_usage_us_sum = 0;
2050 		return;
2051 	}
2052 
2053 	/*
2054 	 * Debtors can pass through a lot of writes choking the device and we
2055 	 * don't want to be forgiving debts while the device is struggling from
2056 	 * write bursts. If we're missing latency targets, consider the device
2057 	 * fully utilized.
2058 	 */
2059 	if (ioc->busy_level > 0)
2060 		usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2061 
2062 	ioc->dfgv_usage_us_sum += usage_us_sum;
2063 	if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2064 		return;
2065 
2066 	/*
2067 	 * At least DFGV_PERIOD has passed since the last period. Calculate the
2068 	 * average usage and reset the period counters.
2069 	 */
2070 	dur = now->now - ioc->dfgv_period_at;
2071 	usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2072 
2073 	ioc->dfgv_period_at = now->now;
2074 	ioc->dfgv_usage_us_sum = 0;
2075 
2076 	/* if it was too busy, reset everything */
2077 	if (usage_pct > DFGV_USAGE_PCT) {
2078 		ioc->dfgv_period_rem = 0;
2079 		return;
2080 	}
2081 
2082 	/*
2083 	 * Usage is lower than threshold. Let's forgive some debts. Debt
2084 	 * forgiveness runs off of the usual ioc timer but its period usually
2085 	 * doesn't match ioc's. Compensate the difference by performing the
2086 	 * reduction as many times as would fit in the duration since the last
2087 	 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2088 	 * - e.g. if the ioc period is 75% of DFGV_PERIOD, forgiveness runs once
2089 	 * every other timer period and every second reduction is doubled.
2090 	 */
2091 	nr_cycles = dur + ioc->dfgv_period_rem;
2092 	ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
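	/*
	 * e.g. if dur plus the carried-over remainder amounts to 2.5
	 * DFGV_PERIODs, nr_cycles becomes 2 and the remaining half period is
	 * carried over in dfgv_period_rem for the next run.
	 */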
2093 
2094 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2095 		u64 __maybe_unused old_debt, __maybe_unused old_delay;
2096 
2097 		if (!iocg->abs_vdebt && !iocg->delay)
2098 			continue;
2099 
2100 		spin_lock(&iocg->waitq.lock);
2101 
2102 		old_debt = iocg->abs_vdebt;
2103 		old_delay = iocg->delay;
2104 
2105 		if (iocg->abs_vdebt)
2106 			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2107 		if (iocg->delay)
2108 			iocg->delay = iocg->delay >> nr_cycles ?: 1;
2109 
2110 		iocg_kick_waitq(iocg, true, now);
2111 
2112 		TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2113 				old_debt, iocg->abs_vdebt,
2114 				old_delay, iocg->delay);
2115 
2116 		spin_unlock(&iocg->waitq.lock);
2117 	}
2118 }
2119 
2120 /*
2121  * Check the active iocgs' state to avoid oversleeping and deactivate
2122  * idle iocgs.
2123  *
2124  * Since waiters determine the sleep durations based on the vrate
2125  * they saw at the time of sleep, if vrate has increased, some
2126  * waiters could be sleeping for too long. Wake up tardy waiters
2127  * which should have woken up in the last period and expire idle
2128  * iocgs.
2129  */
2130 static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2131 {
2132 	int nr_debtors = 0;
2133 	struct ioc_gq *iocg, *tiocg;
2134 
2135 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2136 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2137 		    !iocg->delay && !iocg_is_idle(iocg))
2138 			continue;
2139 
2140 		spin_lock(&iocg->waitq.lock);
2141 
2142 		/* flush wait and indebt stat deltas */
2143 		if (iocg->wait_since) {
2144 			iocg->local_stat.wait_us += now->now - iocg->wait_since;
2145 			iocg->wait_since = now->now;
2146 		}
2147 		if (iocg->indebt_since) {
2148 			iocg->local_stat.indebt_us +=
2149 				now->now - iocg->indebt_since;
2150 			iocg->indebt_since = now->now;
2151 		}
2152 		if (iocg->indelay_since) {
2153 			iocg->local_stat.indelay_us +=
2154 				now->now - iocg->indelay_since;
2155 			iocg->indelay_since = now->now;
2156 		}
2157 
2158 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2159 		    iocg->delay) {
2160 			/* might be oversleeping vtime / hweight changes, kick */
2161 			iocg_kick_waitq(iocg, true, now);
2162 			if (iocg->abs_vdebt || iocg->delay)
2163 				nr_debtors++;
2164 		} else if (iocg_is_idle(iocg)) {
2165 			/* no waiter and idle, deactivate */
2166 			u64 vtime = atomic64_read(&iocg->vtime);
2167 			s64 excess;
2168 
2169 			/*
2170 			 * @iocg has been inactive for a full duration and will
2171 			 * have a high budget. Account anything above the target as
2172 			 * error and throw it away. On reactivation, it'll start
2173 			 * with the target budget.
2174 			 */
2175 			excess = now->vnow - vtime - ioc->margins.target;
2176 			if (excess > 0) {
2177 				u32 old_hwi;
2178 
2179 				current_hweight(iocg, NULL, &old_hwi);
2180 				ioc->vtime_err -= div64_u64(excess * old_hwi,
2181 							    WEIGHT_ONE);
2182 			}
2183 
2184 			TRACE_IOCG_PATH(iocg_idle, iocg, now,
2185 					atomic64_read(&iocg->active_period),
2186 					atomic64_read(&ioc->cur_period), vtime);
2187 			__propagate_weights(iocg, 0, 0, false, now);
2188 			list_del_init(&iocg->active_list);
2189 		}
2190 
2191 		spin_unlock(&iocg->waitq.lock);
2192 	}
2193 
2194 	commit_weights(ioc);
2195 	return nr_debtors;
2196 }
2197 
2198 static void ioc_timer_fn(struct timer_list *timer)
2199 {
2200 	struct ioc *ioc = container_of(timer, struct ioc, timer);
2201 	struct ioc_gq *iocg, *tiocg;
2202 	struct ioc_now now;
2203 	LIST_HEAD(surpluses);
2204 	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2205 	u64 usage_us_sum = 0;
2206 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2207 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2208 	u32 missed_ppm[2], rq_wait_pct;
2209 	u64 period_vtime;
2210 	int prev_busy_level;
2211 
2212 	/* how were the latencies during the period? */
2213 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2214 
2215 	/* take care of active iocgs */
2216 	spin_lock_irq(&ioc->lock);
2217 
2218 	ioc_now(ioc, &now);
2219 
2220 	period_vtime = now.vnow - ioc->period_at_vtime;
2221 	if (WARN_ON_ONCE(!period_vtime)) {
2222 		spin_unlock_irq(&ioc->lock);
2223 		return;
2224 	}
2225 
2226 	nr_debtors = ioc_check_iocgs(ioc, &now);
2227 
2228 	/*
2229 	 * Wait and indebt stats are flushed above and the donation calculation
2230 	 * below needs updated usage stats. Let's bring the stats up-to-date.
2231 	 */
2232 	iocg_flush_stat(&ioc->active_iocgs, &now);
2233 
2234 	/* calc usage and see whether some weights need to be moved around */
2235 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2236 		u64 vdone, vtime, usage_us;
2237 		u32 hw_active, hw_inuse;
2238 
2239 		/*
2240 		 * Collect unused and wind vtime closer to vnow to prevent
2241 		 * iocgs from accumulating a large amount of budget.
2242 		 */
2243 		vdone = atomic64_read(&iocg->done_vtime);
2244 		vtime = atomic64_read(&iocg->vtime);
2245 		current_hweight(iocg, &hw_active, &hw_inuse);
2246 
2247 		/*
2248 		 * Latency QoS detection doesn't account for IOs which are
2249 		 * in-flight for longer than a period.  Detect them by
2250 		 * comparing vdone against period start.  If lagging behind
2251 		 * IOs from past periods, don't increase vrate.
2252 		 */
2253 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2254 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2255 		    time_after64(vtime, vdone) &&
2256 		    time_after64(vtime, now.vnow -
2257 				 MAX_LAGGING_PERIODS * period_vtime) &&
2258 		    time_before64(vdone, now.vnow - period_vtime))
2259 			nr_lagging++;
2260 
2261 		/*
2262 		 * Determine absolute usage factoring in in-flight IOs to avoid
2263 		 * high-latency completions appearing as idle.
2264 		 */
2265 		usage_us = iocg->usage_delta_us;
2266 		usage_us_sum += usage_us;
2267 
2268 		/* see whether there's surplus vtime */
2269 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2270 		if (hw_inuse < hw_active ||
2271 		    (!waitqueue_active(&iocg->waitq) &&
2272 		     time_before64(vtime, now.vnow - ioc->margins.low))) {
2273 			u32 hwa, old_hwi, hwm, new_hwi, usage;
2274 			u64 usage_dur;
2275 
2276 			if (vdone != vtime) {
2277 				u64 inflight_us = DIV64_U64_ROUND_UP(
2278 					cost_to_abs_cost(vtime - vdone, hw_inuse),
2279 					ioc->vtime_base_rate);
2280 
2281 				usage_us = max(usage_us, inflight_us);
2282 			}
2283 
2284 			/* convert to hweight based usage ratio */
2285 			if (time_after64(iocg->activated_at, ioc->period_at))
2286 				usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2287 			else
2288 				usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2289 
2290 			usage = clamp_t(u32,
2291 				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2292 						   usage_dur),
2293 				1, WEIGHT_ONE);
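			/*
			 * e.g. 30ms of absolute usage over a 100ms window
			 * maps to roughly 30% of WEIGHT_ONE here.
			 */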
2294 
2295 			/*
2296 			 * Already donating or accumulated enough to start.
2297 			 * Determine the donation amount.
2298 			 */
2299 			current_hweight(iocg, &hwa, &old_hwi);
2300 			hwm = current_hweight_max(iocg);
2301 			new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2302 							 usage, &now);
2303 			if (new_hwi < hwm) {
2304 				iocg->hweight_donating = hwa;
2305 				iocg->hweight_after_donation = new_hwi;
2306 				list_add(&iocg->surplus_list, &surpluses);
2307 			} else {
2308 				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2309 						iocg->inuse, iocg->active,
2310 						iocg->hweight_inuse, new_hwi);
2311 
2312 				__propagate_weights(iocg, iocg->active,
2313 						    iocg->active, true, &now);
2314 				nr_shortages++;
2315 			}
2316 		} else {
2317 			/* genuinely short on vtime */
2318 			nr_shortages++;
2319 		}
2320 	}
2321 
2322 	if (!list_empty(&surpluses) && nr_shortages)
2323 		transfer_surpluses(&surpluses, &now);
2324 
2325 	commit_weights(ioc);
2326 
2327 	/* surplus list should be dissolved after use */
2328 	list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2329 		list_del_init(&iocg->surplus_list);
2330 
2331 	/*
2332 	 * If q is getting clogged or we're missing too much, we're issuing
2333 	 * too much IO and should lower vtime rate.  If we're not missing
2334 	 * and experiencing shortages but not surpluses, we're too stingy
2335 	 * and should increase vtime rate.
2336 	 */
2337 	prev_busy_level = ioc->busy_level;
2338 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2339 	    missed_ppm[READ] > ppm_rthr ||
2340 	    missed_ppm[WRITE] > ppm_wthr) {
2341 		/* clearly missing QoS targets, slow down vrate */
2342 		ioc->busy_level = max(ioc->busy_level, 0);
2343 		ioc->busy_level++;
2344 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2345 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2346 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2347 		/* QoS targets are being met with >25% margin */
2348 		if (nr_shortages) {
2349 			/*
2350 			 * We're throttling while the device has spare
2351 			 * capacity.  If vrate was being slowed down, stop.
2352 			 */
2353 			ioc->busy_level = min(ioc->busy_level, 0);
2354 
2355 			/*
2356 			 * If there are IOs spanning multiple periods, wait
2357 			 * them out before pushing the device harder.
2358 			 */
2359 			if (!nr_lagging)
2360 				ioc->busy_level--;
2361 		} else {
2362 			/*
2363 			 * Nobody is being throttled and the users aren't
2364 			 * issuing enough IOs to saturate the device.  We
2365 			 * simply don't know how close the device is to
2366 			 * saturation.  Coast.
2367 			 */
2368 			ioc->busy_level = 0;
2369 		}
2370 	} else {
2371 		/* inside the hysteresis margin, we're good */
2372 		ioc->busy_level = 0;
2373 	}
2374 
2375 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2376 
2377 	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2378 			      prev_busy_level, missed_ppm);
2379 
2380 	ioc_refresh_params(ioc, false);
2381 
2382 	ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2383 
2384 	/*
2385 	 * This period is done.  Move onto the next one.  If nothing's
2386 	 * going on with the device, stop the timer.
2387 	 */
2388 	atomic64_inc(&ioc->cur_period);
2389 
2390 	if (ioc->running != IOC_STOP) {
2391 		if (!list_empty(&ioc->active_iocgs)) {
2392 			ioc_start_period(ioc, &now);
2393 		} else {
2394 			ioc->busy_level = 0;
2395 			ioc->vtime_err = 0;
2396 			ioc->running = IOC_IDLE;
2397 		}
2398 
2399 		ioc_refresh_vrate(ioc, &now);
2400 	}
2401 
2402 	spin_unlock_irq(&ioc->lock);
2403 }
2404 
2405 static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2406 				      u64 abs_cost, struct ioc_now *now)
2407 {
2408 	struct ioc *ioc = iocg->ioc;
2409 	struct ioc_margins *margins = &ioc->margins;
2410 	u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2411 	u32 hwi, adj_step;
2412 	s64 margin;
2413 	u64 cost, new_inuse;
2414 
2415 	current_hweight(iocg, NULL, &hwi);
2416 	old_hwi = hwi;
2417 	cost = abs_cost_to_cost(abs_cost, hwi);
2418 	margin = now->vnow - vtime - cost;
2419 
2420 	/* debt handling owns inuse for debtors */
2421 	if (iocg->abs_vdebt)
2422 		return cost;
2423 
2424 	/*
2425 	 * We only increase inuse during the period and do so if the margin has
2426 	 * deteriorated since the previous adjustment.
2427 	 */
2428 	if (margin >= iocg->saved_margin || margin >= margins->low ||
2429 	    iocg->inuse == iocg->active)
2430 		return cost;
2431 
2432 	spin_lock_irq(&ioc->lock);
2433 
2434 	/* we own inuse only when @iocg is in the normal active state */
2435 	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2436 		spin_unlock_irq(&ioc->lock);
2437 		return cost;
2438 	}
2439 
2440 	/*
2441 	 * Bump up inuse till @abs_cost fits in the existing budget.
2442 	 * adj_step must be determined after acquiring ioc->lock - we might
2443 	 * have raced and lost to another thread for activation and could
2444 	 * be reading a zero iocg->active from before ioc->lock was taken,
2445 	 * which would lead to an infinite loop.
2446 	 */
2447 	new_inuse = iocg->inuse;
2448 	adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
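	/*
	 * Illustration (assuming INUSE_ADJ_STEP_PCT is 25): with active ==
	 * 10000, each pass of the loop below raises inuse by 2500 until the
	 * charge fits within the budget or inuse reaches active.
	 */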
2449 	do {
2450 		new_inuse = new_inuse + adj_step;
2451 		propagate_weights(iocg, iocg->active, new_inuse, true, now);
2452 		current_hweight(iocg, NULL, &hwi);
2453 		cost = abs_cost_to_cost(abs_cost, hwi);
2454 	} while (time_after64(vtime + cost, now->vnow) &&
2455 		 iocg->inuse != iocg->active);
2456 
2457 	spin_unlock_irq(&ioc->lock);
2458 
2459 	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2460 			old_inuse, iocg->inuse, old_hwi, hwi);
2461 
2462 	return cost;
2463 }
2464 
2465 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2466 				    bool is_merge, u64 *costp)
2467 {
2468 	struct ioc *ioc = iocg->ioc;
2469 	u64 coef_seqio, coef_randio, coef_page;
2470 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2471 	u64 seek_pages = 0;
2472 	u64 cost = 0;
2473 
2474 	switch (bio_op(bio)) {
2475 	case REQ_OP_READ:
2476 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
2477 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
2478 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
2479 		break;
2480 	case REQ_OP_WRITE:
2481 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
2482 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
2483 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
2484 		break;
2485 	default:
2486 		goto out;
2487 	}
2488 
2489 	if (iocg->cursor) {
2490 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2491 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2492 	}
2493 
2494 	if (!is_merge) {
2495 		if (seek_pages > LCOEF_RANDIO_PAGES) {
2496 			cost += coef_randio;
2497 		} else {
2498 			cost += coef_seqio;
2499 		}
2500 	}
2501 	cost += pages * coef_page;
2502 out:
2503 	*costp = cost;
2504 }
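/*
 * Illustration of the linear model above (assuming 4K pages): a 256KiB IO is
 * 64 pages, so a fresh submission seeking further than LCOEF_RANDIO_PAGES
 * from the cursor costs coef_randio + 64 * coef_page, while a merge is
 * charged only the 64 * coef_page size portion.
 */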
2505 
2506 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2507 {
2508 	u64 cost;
2509 
2510 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2511 	return cost;
2512 }
2513 
2514 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2515 					 u64 *costp)
2516 {
2517 	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2518 
2519 	switch (req_op(rq)) {
2520 	case REQ_OP_READ:
2521 		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2522 		break;
2523 	case REQ_OP_WRITE:
2524 		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2525 		break;
2526 	default:
2527 		*costp = 0;
2528 	}
2529 }
2530 
2531 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2532 {
2533 	u64 cost;
2534 
2535 	calc_size_vtime_cost_builtin(rq, ioc, &cost);
2536 	return cost;
2537 }
2538 
2539 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2540 {
2541 	struct blkcg_gq *blkg = bio->bi_blkg;
2542 	struct ioc *ioc = rqos_to_ioc(rqos);
2543 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
2544 	struct ioc_now now;
2545 	struct iocg_wait wait;
2546 	u64 abs_cost, cost, vtime;
2547 	bool use_debt, ioc_locked;
2548 	unsigned long flags;
2549 
2550 	/* bypass IOs if disabled, still initializing, or for root cgroup */
2551 	if (!ioc->enabled || !iocg || !iocg->level)
2552 		return;
2553 
2554 	/* calculate the absolute vtime cost */
2555 	abs_cost = calc_vtime_cost(bio, iocg, false);
2556 	if (!abs_cost)
2557 		return;
2558 
2559 	if (!iocg_activate(iocg, &now))
2560 		return;
2561 
2562 	iocg->cursor = bio_end_sector(bio);
2563 	vtime = atomic64_read(&iocg->vtime);
2564 	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2565 
2566 	/*
2567 	 * If no one's waiting and within budget, issue right away.  The
2568 	 * tests are racy but the races aren't systemic - we only miss once
2569 	 * in a while which is fine.
2570 	 */
2571 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2572 	    time_before_eq64(vtime + cost, now.vnow)) {
2573 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2574 		return;
2575 	}
2576 
2577 	/*
2578 	 * We're over budget. This can be handled in two ways. IOs which may
2579 	 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2580 	 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2581 	 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2582 	 * whether debt handling is needed and acquire locks accordingly.
2583 	 */
2584 	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2585 	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2586 retry_lock:
2587 	iocg_lock(iocg, ioc_locked, &flags);
2588 
2589 	/*
2590 	 * @iocg must stay activated for debt and waitq handling. Deactivation
2591 	 * is synchronized against both ioc->lock and waitq.lock and we won't
2592 	 * get deactivated as long as we're waiting or have debt, so we're good
2593 	 * if we're activated here. In the unlikely case that we aren't, just
2594 	 * issue the IO.
2595 	 */
2596 	if (unlikely(list_empty(&iocg->active_list))) {
2597 		iocg_unlock(iocg, ioc_locked, &flags);
2598 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2599 		return;
2600 	}
2601 
2602 	/*
2603 	 * We're over budget. If @bio has to be issued regardless, remember
2604 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2605 	 * off the debt before waking more IOs.
2606 	 *
2607 	 * This way, the debt is continuously paid off each period with the
2608 	 * actual budget available to the cgroup. If we just wound vtime, we
2609 	 * would incorrectly use the current hw_inuse for the entire amount
2610 	 * which, for example, can lead to the cgroup staying blocked for a
2611 	 * long time even with substantially raised hw_inuse.
2612 	 *
2613 	 * An iocg with vdebt should stay online so that the timer can keep
2614 	 * deducting its vdebt and [de]activate use_delay mechanism
2615 	 * accordingly. We don't want to race against the timer trying to
2616 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2617 	 * penalizing the cgroup and its descendants.
2618 	 */
2619 	if (use_debt) {
2620 		iocg_incur_debt(iocg, abs_cost, &now);
2621 		if (iocg_kick_delay(iocg, &now))
2622 			blkcg_schedule_throttle(rqos->q,
2623 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2624 		iocg_unlock(iocg, ioc_locked, &flags);
2625 		return;
2626 	}
2627 
2628 	/* guarantee that iocgs w/ waiters have maximum inuse */
2629 	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2630 		if (!ioc_locked) {
2631 			iocg_unlock(iocg, false, &flags);
2632 			ioc_locked = true;
2633 			goto retry_lock;
2634 		}
2635 		propagate_weights(iocg, iocg->active, iocg->active, true,
2636 				  &now);
2637 	}
2638 
2639 	/*
2640 	 * Append self to the waitq and schedule the wakeup timer if we're
2641 	 * the first waiter.  The timer duration is calculated based on the
2642 	 * current vrate.  vtime and hweight changes can make it too short
2643 	 * or too long.  Each wait entry records the absolute cost it's
2644 	 * waiting for to allow re-evaluation using a custom wait entry.
2645 	 *
2646 	 * If too short, the timer simply reschedules itself.  If too long,
2647 	 * the period timer will notice and trigger wakeups.
2648 	 *
2649 	 * All waiters are on iocg->waitq and the wait states are
2650 	 * synchronized using waitq.lock.
2651 	 */
2652 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2653 	wait.wait.private = current;
2654 	wait.bio = bio;
2655 	wait.abs_cost = abs_cost;
2656 	wait.committed = false;	/* will be set true by waker */
2657 
2658 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2659 	iocg_kick_waitq(iocg, ioc_locked, &now);
2660 
2661 	iocg_unlock(iocg, ioc_locked, &flags);
2662 
2663 	while (true) {
2664 		set_current_state(TASK_UNINTERRUPTIBLE);
2665 		if (wait.committed)
2666 			break;
2667 		io_schedule();
2668 	}
2669 
2670 	/* waker already committed us, proceed */
2671 	finish_wait(&iocg->waitq, &wait.wait);
2672 }
2673 
2674 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2675 			   struct bio *bio)
2676 {
2677 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2678 	struct ioc *ioc = rqos_to_ioc(rqos);
2679 	sector_t bio_end = bio_end_sector(bio);
2680 	struct ioc_now now;
2681 	u64 vtime, abs_cost, cost;
2682 	unsigned long flags;
2683 
2684 	/* bypass if disabled, still initializing, or for root cgroup */
2685 	if (!ioc->enabled || !iocg || !iocg->level)
2686 		return;
2687 
2688 	abs_cost = calc_vtime_cost(bio, iocg, true);
2689 	if (!abs_cost)
2690 		return;
2691 
2692 	ioc_now(ioc, &now);
2693 
2694 	vtime = atomic64_read(&iocg->vtime);
2695 	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2696 
2697 	/* update cursor if backmerging into the request at the cursor */
2698 	if (blk_rq_pos(rq) < bio_end &&
2699 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2700 		iocg->cursor = bio_end;
2701 
2702 	/*
2703 	 * Charge if there's enough vtime budget and the existing request has
2704 	 * cost assigned.
2705 	 */
2706 	if (rq->bio && rq->bio->bi_iocost_cost &&
2707 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2708 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2709 		return;
2710 	}
2711 
2712 	/*
2713 	 * Otherwise, account it as debt if @iocg is online, which it should
2714 	 * be for the vast majority of cases. See debt handling in
2715 	 * ioc_rqos_throttle() for details.
2716 	 */
2717 	spin_lock_irqsave(&ioc->lock, flags);
2718 	spin_lock(&iocg->waitq.lock);
2719 
2720 	if (likely(!list_empty(&iocg->active_list))) {
2721 		iocg_incur_debt(iocg, abs_cost, &now);
2722 		if (iocg_kick_delay(iocg, &now))
2723 			blkcg_schedule_throttle(rqos->q,
2724 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2725 	} else {
2726 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2727 	}
2728 
2729 	spin_unlock(&iocg->waitq.lock);
2730 	spin_unlock_irqrestore(&ioc->lock, flags);
2731 }
2732 
2733 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2734 {
2735 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2736 
2737 	if (iocg && bio->bi_iocost_cost)
2738 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2739 }
2740 
2741 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2742 {
2743 	struct ioc *ioc = rqos_to_ioc(rqos);
2744 	struct ioc_pcpu_stat *ccs;
2745 	u64 on_q_ns, rq_wait_ns, size_nsec;
2746 	int pidx, rw;
2747 
2748 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2749 		return;
2750 
2751 	switch (req_op(rq) & REQ_OP_MASK) {
2752 	case REQ_OP_READ:
2753 		pidx = QOS_RLAT;
2754 		rw = READ;
2755 		break;
2756 	case REQ_OP_WRITE:
2757 		pidx = QOS_WLAT;
2758 		rw = WRITE;
2759 		break;
2760 	default:
2761 		return;
2762 	}
2763 
2764 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2765 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2766 	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2767 
2768 	ccs = get_cpu_ptr(ioc->pcpu_stat);
2769 
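	/*
	 * e.g. assuming rlat=10000 (10ms), a read which spent 12ms from
	 * allocation to completion with 3ms worth of size cost counts as
	 * having met the latency target (12ms - 3ms <= 10ms).
	 */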
2770 	if (on_q_ns <= size_nsec ||
2771 	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2772 		local_inc(&ccs->missed[rw].nr_met);
2773 	else
2774 		local_inc(&ccs->missed[rw].nr_missed);
2775 
2776 	local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2777 
2778 	put_cpu_ptr(ccs);
2779 }
2780 
2781 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2782 {
2783 	struct ioc *ioc = rqos_to_ioc(rqos);
2784 
2785 	spin_lock_irq(&ioc->lock);
2786 	ioc_refresh_params(ioc, false);
2787 	spin_unlock_irq(&ioc->lock);
2788 }
2789 
2790 static void ioc_rqos_exit(struct rq_qos *rqos)
2791 {
2792 	struct ioc *ioc = rqos_to_ioc(rqos);
2793 
2794 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2795 
2796 	spin_lock_irq(&ioc->lock);
2797 	ioc->running = IOC_STOP;
2798 	spin_unlock_irq(&ioc->lock);
2799 
2800 	del_timer_sync(&ioc->timer);
2801 	free_percpu(ioc->pcpu_stat);
2802 	kfree(ioc);
2803 }
2804 
2805 static struct rq_qos_ops ioc_rqos_ops = {
2806 	.throttle = ioc_rqos_throttle,
2807 	.merge = ioc_rqos_merge,
2808 	.done_bio = ioc_rqos_done_bio,
2809 	.done = ioc_rqos_done,
2810 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
2811 	.exit = ioc_rqos_exit,
2812 };
2813 
2814 static int blk_iocost_init(struct request_queue *q)
2815 {
2816 	struct ioc *ioc;
2817 	struct rq_qos *rqos;
2818 	int i, cpu, ret;
2819 
2820 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2821 	if (!ioc)
2822 		return -ENOMEM;
2823 
2824 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2825 	if (!ioc->pcpu_stat) {
2826 		kfree(ioc);
2827 		return -ENOMEM;
2828 	}
2829 
2830 	for_each_possible_cpu(cpu) {
2831 		struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2832 
2833 		for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2834 			local_set(&ccs->missed[i].nr_met, 0);
2835 			local_set(&ccs->missed[i].nr_missed, 0);
2836 		}
2837 		local64_set(&ccs->rq_wait_ns, 0);
2838 	}
2839 
2840 	rqos = &ioc->rqos;
2841 	rqos->id = RQ_QOS_COST;
2842 	rqos->ops = &ioc_rqos_ops;
2843 	rqos->q = q;
2844 
2845 	spin_lock_init(&ioc->lock);
2846 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
2847 	INIT_LIST_HEAD(&ioc->active_iocgs);
2848 
2849 	ioc->running = IOC_IDLE;
2850 	ioc->vtime_base_rate = VTIME_PER_USEC;
2851 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2852 	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2853 	ioc->period_at = ktime_to_us(ktime_get());
2854 	atomic64_set(&ioc->cur_period, 0);
2855 	atomic_set(&ioc->hweight_gen, 0);
2856 
2857 	spin_lock_irq(&ioc->lock);
2858 	ioc->autop_idx = AUTOP_INVALID;
2859 	ioc_refresh_params(ioc, true);
2860 	spin_unlock_irq(&ioc->lock);
2861 
2862 	/*
2863 	 * rqos must be added before activation to allow iocg_pd_init() to
2864 	 * look up the ioc from q. This means that the rqos methods may get
2865 	 * called before policy activation completes, so they can't assume that
2866 	 * the target bio has an associated iocg and need to test for a NULL iocg.
2867 	 */
2868 	rq_qos_add(q, rqos);
2869 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2870 	if (ret) {
2871 		rq_qos_del(q, rqos);
2872 		free_percpu(ioc->pcpu_stat);
2873 		kfree(ioc);
2874 		return ret;
2875 	}
2876 	return 0;
2877 }
2878 
2879 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2880 {
2881 	struct ioc_cgrp *iocc;
2882 
2883 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2884 	if (!iocc)
2885 		return NULL;
2886 
2887 	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2888 	return &iocc->cpd;
2889 }
2890 
2891 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2892 {
2893 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2894 }
2895 
2896 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2897 					     struct blkcg *blkcg)
2898 {
2899 	int levels = blkcg->css.cgroup->level + 1;
2900 	struct ioc_gq *iocg;
2901 
2902 	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2903 	if (!iocg)
2904 		return NULL;
2905 
2906 	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2907 	if (!iocg->pcpu_stat) {
2908 		kfree(iocg);
2909 		return NULL;
2910 	}
2911 
2912 	return &iocg->pd;
2913 }
2914 
2915 static void ioc_pd_init(struct blkg_policy_data *pd)
2916 {
2917 	struct ioc_gq *iocg = pd_to_iocg(pd);
2918 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2919 	struct ioc *ioc = q_to_ioc(blkg->q);
2920 	struct ioc_now now;
2921 	struct blkcg_gq *tblkg;
2922 	unsigned long flags;
2923 
2924 	ioc_now(ioc, &now);
2925 
2926 	iocg->ioc = ioc;
2927 	atomic64_set(&iocg->vtime, now.vnow);
2928 	atomic64_set(&iocg->done_vtime, now.vnow);
2929 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2930 	INIT_LIST_HEAD(&iocg->active_list);
2931 	INIT_LIST_HEAD(&iocg->walk_list);
2932 	INIT_LIST_HEAD(&iocg->surplus_list);
2933 	iocg->hweight_active = WEIGHT_ONE;
2934 	iocg->hweight_inuse = WEIGHT_ONE;
2935 
2936 	init_waitqueue_head(&iocg->waitq);
2937 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2938 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2939 
2940 	iocg->level = blkg->blkcg->css.cgroup->level;
2941 
2942 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2943 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2944 		iocg->ancestors[tiocg->level] = tiocg;
2945 	}
2946 
2947 	spin_lock_irqsave(&ioc->lock, flags);
2948 	weight_updated(iocg, &now);
2949 	spin_unlock_irqrestore(&ioc->lock, flags);
2950 }
2951 
2952 static void ioc_pd_free(struct blkg_policy_data *pd)
2953 {
2954 	struct ioc_gq *iocg = pd_to_iocg(pd);
2955 	struct ioc *ioc = iocg->ioc;
2956 	unsigned long flags;
2957 
2958 	if (ioc) {
2959 		spin_lock_irqsave(&ioc->lock, flags);
2960 
2961 		if (!list_empty(&iocg->active_list)) {
2962 			struct ioc_now now;
2963 
2964 			ioc_now(ioc, &now);
2965 			propagate_weights(iocg, 0, 0, false, &now);
2966 			list_del_init(&iocg->active_list);
2967 		}
2968 
2969 		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
2970 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2971 
2972 		spin_unlock_irqrestore(&ioc->lock, flags);
2973 
2974 		hrtimer_cancel(&iocg->waitq_timer);
2975 	}
2976 	free_percpu(iocg->pcpu_stat);
2977 	kfree(iocg);
2978 }
2979 
2980 static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2981 {
2982 	struct ioc_gq *iocg = pd_to_iocg(pd);
2983 	struct ioc *ioc = iocg->ioc;
2984 	size_t pos = 0;
2985 
2986 	if (!ioc->enabled)
2987 		return 0;
2988 
2989 	if (iocg->level == 0) {
2990 		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
2991 			ioc->vtime_base_rate * 10000,
2992 			VTIME_PER_USEC);
2993 		pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2994 				  vp10k / 100, vp10k % 100);
2995 	}
2996 
2997 	pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2998 			 iocg->last_stat.usage_us);
2999 
3000 	if (blkcg_debug_stats)
3001 		pos += scnprintf(buf + pos, size - pos,
3002 				 " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3003 				 iocg->last_stat.wait_us,
3004 				 iocg->last_stat.indebt_us,
3005 				 iocg->last_stat.indelay_us);
3006 
3007 	return pos;
3008 }
3009 
3010 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3011 			     int off)
3012 {
3013 	const char *dname = blkg_dev_name(pd->blkg);
3014 	struct ioc_gq *iocg = pd_to_iocg(pd);
3015 
3016 	if (dname && iocg->cfg_weight)
3017 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3018 	return 0;
3019 }
3020 
3021 
3022 static int ioc_weight_show(struct seq_file *sf, void *v)
3023 {
3024 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3025 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3026 
3027 	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3028 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3029 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3030 	return 0;
3031 }
3032 
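/*
 * Writes accept either "default WEIGHT" / "WEIGHT" to set the cgroup-wide
 * default or "MAJ:MIN WEIGHT" for a per-device override, e.g. (hypothetical
 * device):
 *
 *   echo "8:16 200" > io.weight
 */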
3033 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3034 				size_t nbytes, loff_t off)
3035 {
3036 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
3037 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3038 	struct blkg_conf_ctx ctx;
3039 	struct ioc_now now;
3040 	struct ioc_gq *iocg;
3041 	u32 v;
3042 	int ret;
3043 
3044 	if (!strchr(buf, ':')) {
3045 		struct blkcg_gq *blkg;
3046 
3047 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3048 			return -EINVAL;
3049 
3050 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3051 			return -EINVAL;
3052 
3053 		spin_lock(&blkcg->lock);
3054 		iocc->dfl_weight = v * WEIGHT_ONE;
3055 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3056 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
3057 
3058 			if (iocg) {
3059 				spin_lock_irq(&iocg->ioc->lock);
3060 				ioc_now(iocg->ioc, &now);
3061 				weight_updated(iocg, &now);
3062 				spin_unlock_irq(&iocg->ioc->lock);
3063 			}
3064 		}
3065 		spin_unlock(&blkcg->lock);
3066 
3067 		return nbytes;
3068 	}
3069 
3070 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3071 	if (ret)
3072 		return ret;
3073 
3074 	iocg = blkg_to_iocg(ctx.blkg);
3075 
3076 	if (!strncmp(ctx.body, "default", 7)) {
3077 		v = 0;
3078 	} else {
3079 		if (!sscanf(ctx.body, "%u", &v))
3080 			goto einval;
3081 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3082 			goto einval;
3083 	}
3084 
3085 	spin_lock(&iocg->ioc->lock);
3086 	iocg->cfg_weight = v * WEIGHT_ONE;
3087 	ioc_now(iocg->ioc, &now);
3088 	weight_updated(iocg, &now);
3089 	spin_unlock(&iocg->ioc->lock);
3090 
3091 	blkg_conf_finish(&ctx);
3092 	return nbytes;
3093 
3094 einval:
3095 	blkg_conf_finish(&ctx);
3096 	return -EINVAL;
3097 }
3098 
3099 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3100 			  int off)
3101 {
3102 	const char *dname = blkg_dev_name(pd->blkg);
3103 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3104 
3105 	if (!dname)
3106 		return 0;
3107 
3108 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3109 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3110 		   ioc->params.qos[QOS_RPPM] / 10000,
3111 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
3112 		   ioc->params.qos[QOS_RLAT],
3113 		   ioc->params.qos[QOS_WPPM] / 10000,
3114 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
3115 		   ioc->params.qos[QOS_WLAT],
3116 		   ioc->params.qos[QOS_MIN] / 10000,
3117 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
3118 		   ioc->params.qos[QOS_MAX] / 10000,
3119 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
3120 	return 0;
3121 }
3122 
3123 static int ioc_qos_show(struct seq_file *sf, void *v)
3124 {
3125 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3126 
3127 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3128 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3129 	return 0;
3130 }
3131 
3132 static const match_table_t qos_ctrl_tokens = {
3133 	{ QOS_ENABLE,		"enable=%u"	},
3134 	{ QOS_CTRL,		"ctrl=%s"	},
3135 	{ NR_QOS_CTRL_PARAMS,	NULL		},
3136 };
3137 
3138 static const match_table_t qos_tokens = {
3139 	{ QOS_RPPM,		"rpct=%s"	},
3140 	{ QOS_RLAT,		"rlat=%u"	},
3141 	{ QOS_WPPM,		"wpct=%s"	},
3142 	{ QOS_WLAT,		"wlat=%u"	},
3143 	{ QOS_MIN,		"min=%s"	},
3144 	{ QOS_MAX,		"max=%s"	},
3145 	{ NR_QOS_PARAMS,	NULL		},
3146 };
3147 
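/*
 * Accepts "MAJ:MIN" followed by any of the above key=value pairs, e.g.
 * (hypothetical values):
 *
 *   echo "8:16 enable=1 rpct=95.00 rlat=10000 wpct=95.00 wlat=25000" \
 *	> /sys/fs/cgroup/io.cost.qos
 */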
3148 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3149 			     size_t nbytes, loff_t off)
3150 {
3151 	struct block_device *bdev;
3152 	struct ioc *ioc;
3153 	u32 qos[NR_QOS_PARAMS];
3154 	bool enable, user;
3155 	char *p;
3156 	int ret;
3157 
3158 	bdev = blkcg_conf_open_bdev(&input);
3159 	if (IS_ERR(bdev))
3160 		return PTR_ERR(bdev);
3161 
3162 	ioc = q_to_ioc(bdev->bd_disk->queue);
3163 	if (!ioc) {
3164 		ret = blk_iocost_init(bdev->bd_disk->queue);
3165 		if (ret)
3166 			goto err;
3167 		ioc = q_to_ioc(bdev->bd_disk->queue);
3168 	}
3169 
3170 	spin_lock_irq(&ioc->lock);
3171 	memcpy(qos, ioc->params.qos, sizeof(qos));
3172 	enable = ioc->enabled;
3173 	user = ioc->user_qos_params;
3174 	spin_unlock_irq(&ioc->lock);
3175 
3176 	while ((p = strsep(&input, " \t\n"))) {
3177 		substring_t args[MAX_OPT_ARGS];
3178 		char buf[32];
3179 		int tok;
3180 		s64 v;
3181 
3182 		if (!*p)
3183 			continue;
3184 
3185 		switch (match_token(p, qos_ctrl_tokens, args)) {
3186 		case QOS_ENABLE:
3187 			match_u64(&args[0], &v);
3188 			enable = v;
3189 			continue;
3190 		case QOS_CTRL:
3191 			match_strlcpy(buf, &args[0], sizeof(buf));
3192 			if (!strcmp(buf, "auto"))
3193 				user = false;
3194 			else if (!strcmp(buf, "user"))
3195 				user = true;
3196 			else
3197 				goto einval;
3198 			continue;
3199 		}
3200 
3201 		tok = match_token(p, qos_tokens, args);
3202 		switch (tok) {
3203 		case QOS_RPPM:
3204 		case QOS_WPPM:
3205 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3206 			    sizeof(buf))
3207 				goto einval;
3208 			if (cgroup_parse_float(buf, 2, &v))
3209 				goto einval;
3210 			if (v < 0 || v > 10000)
3211 				goto einval;
3212 			qos[tok] = v * 100;
3213 			break;
3214 		case QOS_RLAT:
3215 		case QOS_WLAT:
3216 			if (match_u64(&args[0], &v))
3217 				goto einval;
3218 			qos[tok] = v;
3219 			break;
3220 		case QOS_MIN:
3221 		case QOS_MAX:
3222 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3223 			    sizeof(buf))
3224 				goto einval;
3225 			if (cgroup_parse_float(buf, 2, &v))
3226 				goto einval;
3227 			if (v < 0)
3228 				goto einval;
3229 			qos[tok] = clamp_t(s64, v * 100,
3230 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
3231 			break;
3232 		default:
3233 			goto einval;
3234 		}
3235 		user = true;
3236 	}
3237 
3238 	if (qos[QOS_MIN] > qos[QOS_MAX])
3239 		goto einval;
3240 
3241 	spin_lock_irq(&ioc->lock);
3242 
3243 	if (enable) {
3244 		blk_stat_enable_accounting(ioc->rqos.q);
3245 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3246 		ioc->enabled = true;
3247 	} else {
3248 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3249 		ioc->enabled = false;
3250 	}
3251 
3252 	if (user) {
3253 		memcpy(ioc->params.qos, qos, sizeof(qos));
3254 		ioc->user_qos_params = true;
3255 	} else {
3256 		ioc->user_qos_params = false;
3257 	}
3258 
3259 	ioc_refresh_params(ioc, true);
3260 	spin_unlock_irq(&ioc->lock);
3261 
3262 	blkdev_put_no_open(bdev);
3263 	return nbytes;
3264 einval:
3265 	ret = -EINVAL;
3266 err:
3267 	blkdev_put_no_open(bdev);
3268 	return ret;
3269 }
3270 
3271 static u64 ioc_cost_model_prfill(struct seq_file *sf,
3272 				 struct blkg_policy_data *pd, int off)
3273 {
3274 	const char *dname = blkg_dev_name(pd->blkg);
3275 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3276 	u64 *u = ioc->params.i_lcoefs;
3277 
3278 	if (!dname)
3279 		return 0;
3280 
3281 	seq_printf(sf, "%s ctrl=%s model=linear "
3282 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
3283 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3284 		   dname, ioc->user_cost_model ? "user" : "auto",
3285 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3286 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3287 	return 0;
3288 }
3289 
3290 static int ioc_cost_model_show(struct seq_file *sf, void *v)
3291 {
3292 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3293 
3294 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3295 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3296 	return 0;
3297 }
3298 
3299 static const match_table_t cost_ctrl_tokens = {
3300 	{ COST_CTRL,		"ctrl=%s"	},
3301 	{ COST_MODEL,		"model=%s"	},
3302 	{ NR_COST_CTRL_PARAMS,	NULL		},
3303 };
3304 
3305 static const match_table_t i_lcoef_tokens = {
3306 	{ I_LCOEF_RBPS,		"rbps=%u"	},
3307 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
3308 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
3309 	{ I_LCOEF_WBPS,		"wbps=%u"	},
3310 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
3311 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
3312 	{ NR_I_LCOEFS,		NULL		},
3313 };
3314 
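/*
 * Accepts "MAJ:MIN" followed by the above key=value pairs, e.g. (hypothetical
 * coefficients):
 *
 *   echo "8:16 ctrl=user model=linear rbps=2000000000 rseqiops=50000" \
 *	> /sys/fs/cgroup/io.cost.model
 */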
3315 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3316 				    size_t nbytes, loff_t off)
3317 {
3318 	struct block_device *bdev;
3319 	struct ioc *ioc;
3320 	u64 u[NR_I_LCOEFS];
3321 	bool user;
3322 	char *p;
3323 	int ret;
3324 
3325 	bdev = blkcg_conf_open_bdev(&input);
3326 	if (IS_ERR(bdev))
3327 		return PTR_ERR(bdev);
3328 
3329 	ioc = q_to_ioc(bdev->bd_disk->queue);
3330 	if (!ioc) {
3331 		ret = blk_iocost_init(bdev->bd_disk->queue);
3332 		if (ret)
3333 			goto err;
3334 		ioc = q_to_ioc(bdev->bd_disk->queue);
3335 	}
3336 
3337 	spin_lock_irq(&ioc->lock);
3338 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3339 	user = ioc->user_cost_model;
3340 	spin_unlock_irq(&ioc->lock);
3341 
3342 	while ((p = strsep(&input, " \t\n"))) {
3343 		substring_t args[MAX_OPT_ARGS];
3344 		char buf[32];
3345 		int tok;
3346 		u64 v;
3347 
3348 		if (!*p)
3349 			continue;
3350 
3351 		switch (match_token(p, cost_ctrl_tokens, args)) {
3352 		case COST_CTRL:
3353 			match_strlcpy(buf, &args[0], sizeof(buf));
3354 			if (!strcmp(buf, "auto"))
3355 				user = false;
3356 			else if (!strcmp(buf, "user"))
3357 				user = true;
3358 			else
3359 				goto einval;
3360 			continue;
3361 		case COST_MODEL:
3362 			match_strlcpy(buf, &args[0], sizeof(buf));
3363 			if (strcmp(buf, "linear"))
3364 				goto einval;
3365 			continue;
3366 		}
3367 
3368 		tok = match_token(p, i_lcoef_tokens, args);
3369 		if (tok == NR_I_LCOEFS)
3370 			goto einval;
3371 		if (match_u64(&args[0], &v))
3372 			goto einval;
3373 		u[tok] = v;
3374 		user = true;
3375 	}
3376 
3377 	spin_lock_irq(&ioc->lock);
3378 	if (user) {
3379 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3380 		ioc->user_cost_model = true;
3381 	} else {
3382 		ioc->user_cost_model = false;
3383 	}
3384 	ioc_refresh_params(ioc, true);
3385 	spin_unlock_irq(&ioc->lock);
3386 
3387 	blkdev_put_no_open(bdev);
3388 	return nbytes;
3389 
3390 einval:
3391 	ret = -EINVAL;
3392 err:
3393 	blkdev_put_no_open(bdev);
3394 	return ret;
3395 }
3396 
3397 static struct cftype ioc_files[] = {
3398 	{
3399 		.name = "weight",
3400 		.flags = CFTYPE_NOT_ON_ROOT,
3401 		.seq_show = ioc_weight_show,
3402 		.write = ioc_weight_write,
3403 	},
3404 	{
3405 		.name = "cost.qos",
3406 		.flags = CFTYPE_ONLY_ON_ROOT,
3407 		.seq_show = ioc_qos_show,
3408 		.write = ioc_qos_write,
3409 	},
3410 	{
3411 		.name = "cost.model",
3412 		.flags = CFTYPE_ONLY_ON_ROOT,
3413 		.seq_show = ioc_cost_model_show,
3414 		.write = ioc_cost_model_write,
3415 	},
3416 	{}
3417 };
3418 
3419 static struct blkcg_policy blkcg_policy_iocost = {
3420 	.dfl_cftypes	= ioc_files,
3421 	.cpd_alloc_fn	= ioc_cpd_alloc,
3422 	.cpd_free_fn	= ioc_cpd_free,
3423 	.pd_alloc_fn	= ioc_pd_alloc,
3424 	.pd_init_fn	= ioc_pd_init,
3425 	.pd_free_fn	= ioc_pd_free,
3426 	.pd_stat_fn	= ioc_pd_stat,
3427 };
3428 
3429 static int __init ioc_init(void)
3430 {
3431 	return blkcg_policy_register(&blkcg_policy_iocost);
3432 }
3433 
3434 static void __exit ioc_exit(void)
3435 {
3436 	blkcg_policy_unregister(&blkcg_policy_iocost);
3437 }
3438 
3439 module_init(ioc_init);
3440 module_exit(ioc_exit);
3441