xref: /linux/net/sched/sch_dualpi2.c (revision 05ed733b65ab977dd931e7f7ac0f62fdb81205c2)
1 // SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
2 /* Copyright (C) 2024 Nokia
3  *
4  * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
5  * Author: Olga Albisser <olga@albisser.org>
6  * Author: Henrik Steen <henrist@henrist.net>
7  * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
8  * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
9  *
10  * DualPI Improved with a Square (dualpi2):
11  * - Supports congestion controls that comply with the Prague requirements
12  *   in RFC9331 (e.g. TCP-Prague)
13  * - Supports coupled dual-queue with PI2 as defined in RFC9332
14  * - Supports ECN L4S-identifier (IP.ECN==0b*1)
15  *
16  * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
17  *   they do not meet the 'Prague L4S Requirements' listed in RFC 9331
18  *   Section 4, so they can only be used with DualPI2 in a datacenter
19  *   context.
20  *
21  * References:
22  * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
23  * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
24  *   scalable TCP."  in proc. ACM CoNEXT'16, 2016.
25  */
26 
27 #include <linux/errno.h>
28 #include <linux/hrtimer.h>
29 #include <linux/if_vlan.h>
30 #include <linux/kernel.h>
31 #include <linux/limits.h>
32 #include <linux/module.h>
33 #include <linux/skbuff.h>
34 #include <linux/types.h>
35 
36 #include <net/gso.h>
37 #include <net/inet_ecn.h>
38 #include <net/pkt_cls.h>
39 #include <net/pkt_sched.h>
40 
/* A 32b probability enables support for flows with windows up to
 * ~8.6 * 1e9 packets, i.e., twice the maximal snd_cwnd.
 * MAX_PROB must be consistent with the RNG in dualpi2_roll().
 */
#define MAX_PROB U32_MAX

/* alpha/beta values exchanged over netlink are in units of 256ns */
#define ALPHA_BETA_SHIFT 8

/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
 * computations. Consequently (see dualpi2_scale_alpha_beta()), their
 * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
 * (~4MHz) as those are given in 1/256th. This enables tuning alpha/beta to
 * control flows whose maximal RTTs range from usec up to a few secs.
 */
#define ALPHA_BETA_MAX ((1U << 31) - 1)

/* Internal alpha/beta are in units of 64ns.
 * This makes it possible to use all alpha/beta values in the allowed range
 * without loss of precision due to rounding when scaling them internally,
 * e.g., scale_alpha_beta(1) will not round down to 0.
 */
#define ALPHA_BETA_GRANULARITY 6

/* Shift applied when converting between netlink and internal alpha/beta */
#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)

/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
#define MAX_WC 100
69 
/* Private state of one dualpi2 qdisc instance (what qdisc_priv() returns).
 *
 * Classic packets are queued on the qdisc itself (@sch) while L-queue packets
 * are queued on the child @l_queue; L-queue packets are additionally counted
 * in the qlen/backlog of @sch at enqueue time.
 */
struct dualpi2_sched_data {
	struct Qdisc *l_queue;	/* The L4S Low latency queue (L-queue) */
	struct Qdisc *sch;	/* The Classic queue (C-queue) */

	/* Registered tc filters */
	struct tcf_proto __rcu *tcf_filters;
	struct tcf_block *tcf_block;

	/* PI2 parameters */
	u64	pi2_target;	/* Target delay in nanoseconds */
	u32	pi2_tupdate;	/* Timer frequency in nanoseconds */
	u32	pi2_prob;	/* Base PI probability */
	u32	pi2_alpha;	/* Gain factor for the integral rate response */
	u32	pi2_beta;	/* Gain factor for the proportional response */
	struct hrtimer pi2_timer; /* prob update timer */

	/* Step AQM (L-queue only) parameters */
	u32	step_thresh;	/* Step threshold */
	bool	step_in_packets; /* Step thresh in packets (1) or time (0) */

	/* C-queue starvation protection */
	s32	c_protection_credit; /* Credit (sign indicates which queue) */
	s32	c_protection_init; /* Reset value of the credit */
	u8	c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
	u8	c_protection_wl; /* L-queue weight (MAX_WC - wc) */

	/* General dualQ parameters */
	u32	memory_limit;	/* Memory limit of both queues */
	u8	coupling_factor;/* Coupling factor (k) between both queues */
	u8	ecn_mask;	/* Mask to match packets into L-queue */
	u32	min_qlen_step;	/* Minimum queue length to apply step thresh */
	bool	drop_early;	/* Drop at enqueue (1) instead of dequeue  (0) */
	bool	drop_overload;	/* Drop (1) on overload, or overflow (0) */
	bool	split_gso;	/* Split aggregated skb (1) or leave as is (0) */

	/* Statistics */
	u64	c_head_ts;	/* Enqueue timestamp of the C-queue head */
	u64	l_head_ts;	/* Enqueue timestamp of the L-queue head */
	u64	last_qdelay;	/* Q delay val at the last probability update */
	u32	packets_in_c;	/* Enqueue packet counter of the C-queue */
	u32	packets_in_l;	/* Enqueue packet counter of the L-queue */
	u32	maxq;		/* Maximum queue size of the C-queue */
	u32	ecn_mark;	/* ECN mark pkt counter due to PI probability */
	u32	step_marks;	/* ECN mark pkt counter due to step AQM */
	u32	memory_used;	/* Memory used of both queues */
	u32	max_memory_used;/* Maximum used memory */

	/* Deferred drop statistics: accumulated while dequeuing and flushed
	 * to the qdisc tree by dualpi2_dequeue_drop().
	 */
	u32	deferred_drops_cnt;	/* Packets dropped */
	u32	deferred_drops_len;	/* Bytes dropped */
};
121 
/* Per-packet dualpi2 state, kept in the private data area of the skb's qdisc
 * control block (see the dualpi2_skb_cb() accessor).
 */
struct dualpi2_skb_cb {
	u64 ts;			/* Timestamp at enqueue */
	u8 apply_step:1,	/* Can we apply the step threshold */
	   classified:2,	/* Packet classification results */
	   ect:2;		/* Packet ECT codepoint */
};
128 
/* Values stored in dualpi2_skb_cb->classified. Anything other than
 * DUALPI2_C_CLASSIC is queued in the L-queue (see skb_in_l_queue()).
 */
enum dualpi2_classification_results {
	DUALPI2_C_CLASSIC	= 0,	/* C-queue */
	DUALPI2_C_L4S		= 1,	/* L-queue (scale mark/classic drop) */
	DUALPI2_C_LLLL		= 2,	/* L-queue (no drops/marks) */
	__DUALPI2_C_MAX			/* Keep last*/
};
135 
136 static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
137 {
138 	qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
139 	return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
140 }
141 
142 static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
143 {
144 	return reference - dualpi2_skb_cb(skb)->ts;
145 }
146 
147 static u64 head_enqueue_time(struct Qdisc *q)
148 {
149 	struct sk_buff *skb = qdisc_peek_head(q);
150 
151 	return skb ? dualpi2_skb_cb(skb)->ts : 0;
152 }
153 
154 static u32 dualpi2_scale_alpha_beta(u32 param)
155 {
156 	u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
157 
158 	do_div(tmp, NSEC_PER_SEC);
159 	return tmp;
160 }
161 
162 static u32 dualpi2_unscale_alpha_beta(u32 param)
163 {
164 	u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);
165 
166 	do_div(tmp, MAX_PROB);
167 	return tmp;
168 }
169 
170 static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
171 {
172 	return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
173 }
174 
175 static bool skb_is_l4s(struct sk_buff *skb)
176 {
177 	return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
178 }
179 
180 static bool skb_in_l_queue(struct sk_buff *skb)
181 {
182 	return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
183 }
184 
185 static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
186 {
187 	return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
188 }
189 
190 static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
191 {
192 	if (INET_ECN_set_ce(skb)) {
193 		WRITE_ONCE(q->ecn_mark, q->ecn_mark + 1);
194 		return true;
195 	}
196 	return false;
197 }
198 
199 static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
200 {
201 	WRITE_ONCE(q->c_protection_credit, q->c_protection_init);
202 }
203 
204 /* This computes the initial credit value and WRR weight for the L queue (wl)
205  * from the weight of the C queue (wc).
206  * If wl > wc, the scheduler will start with the L queue when reset.
207  */
208 static void dualpi2_calculate_c_protection(struct Qdisc *sch,
209 					   struct dualpi2_sched_data *q, u32 wc)
210 {
211 	q->c_protection_wc = wc;
212 	q->c_protection_wl = MAX_WC - wc;
213 	q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
214 		((int)q->c_protection_wc - (int)q->c_protection_wl);
215 	dualpi2_reset_c_protection(q);
216 }
217 
218 static bool dualpi2_roll(u32 prob)
219 {
220 	return get_random_u32() <= prob;
221 }
222 
223 /* Packets in the C-queue are subject to a marking probability pC, which is the
224  * square of the internal PI probability (i.e., have an overall lower mark/drop
225  * probability). If the qdisc is overloaded, ignore ECT values and only drop.
226  *
227  * Note that this marking scheme is also applied to L4S packets during overload.
228  * Return true if packet dropping is required in C queue
229  */
230 static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
231 				    struct sk_buff *skb, u32 prob,
232 				    bool overload)
233 {
234 	if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
235 		if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
236 			return true;
237 		dualpi2_mark(q, skb);
238 	}
239 	return false;
240 }
241 
242 /* Packets in the L-queue are subject to a marking probability pL given by the
243  * internal PI probability scaled by the coupling factor.
244  *
245  * On overload (i.e., @local_l_prob is >= 100%):
246  * - if the qdisc is configured to trade losses to preserve latency (i.e.,
247  *   @q->drop_overload), apply classic drops first before marking.
248  * - otherwise, preserve the "no loss" property of ECN at the cost of queueing
249  *   delay, eventually resulting in taildrop behavior once sch->limit is
250  *   reached.
251  * Return true if packet dropping is required in L queue
252  */
253 static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
254 				     struct sk_buff *skb,
255 				     u64 local_l_prob, u32 prob,
256 				     bool overload)
257 {
258 	if (overload) {
259 		/* Apply classic drop */
260 		if (!q->drop_overload ||
261 		    !(dualpi2_roll(prob) && dualpi2_roll(prob)))
262 			goto mark;
263 		return true;
264 	}
265 
266 	/* We can safely cut the upper 32b as overload==false */
267 	if (dualpi2_roll(local_l_prob)) {
268 		/* Non-ECT packets could have classified as L4S by filters. */
269 		if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
270 			return true;
271 mark:
272 		dualpi2_mark(q, skb);
273 	}
274 	return false;
275 }
276 
277 /* Decide whether a given packet must be dropped (or marked if ECT), according
278  * to the PI2 probability.
279  *
280  * Never mark/drop if we have a standing queue of less than 2 MTUs.
281  */
282 static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
283 		      struct sk_buff *skb)
284 {
285 	u64 local_l_prob;
286 	bool overload;
287 	u32 prob;
288 
289 	if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
290 		return false;
291 
292 	prob = READ_ONCE(q->pi2_prob);
293 	local_l_prob = (u64)prob * q->coupling_factor;
294 	overload = local_l_prob > MAX_PROB;
295 
296 	switch (dualpi2_skb_cb(skb)->classified) {
297 	case DUALPI2_C_CLASSIC:
298 		return dualpi2_classic_marking(q, skb, prob, overload);
299 	case DUALPI2_C_L4S:
300 		return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
301 						overload);
302 	default: /* DUALPI2_C_LLLL */
303 		return false;
304 	}
305 }
306 
307 static void dualpi2_read_ect(struct sk_buff *skb)
308 {
309 	struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
310 	int wlen = skb_network_offset(skb);
311 
312 	switch (skb_protocol(skb, true)) {
313 	case htons(ETH_P_IP):
314 		wlen += sizeof(struct iphdr);
315 		if (!pskb_may_pull(skb, wlen) ||
316 		    skb_try_make_writable(skb, wlen))
317 			goto not_ecn;
318 
319 		cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
320 		break;
321 	case htons(ETH_P_IPV6):
322 		wlen += sizeof(struct ipv6hdr);
323 		if (!pskb_may_pull(skb, wlen) ||
324 		    skb_try_make_writable(skb, wlen))
325 			goto not_ecn;
326 
327 		cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
328 		break;
329 	default:
330 		goto not_ecn;
331 	}
332 	return;
333 
334 not_ecn:
335 	/* Non pullable/writable packets can only be dropped hence are
336 	 * classified as not ECT.
337 	 */
338 	cb->ect = INET_ECN_NOT_ECT;
339 }
340 
341 static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
342 				struct sk_buff *skb)
343 {
344 	struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
345 	struct tcf_result res;
346 	struct tcf_proto *fl;
347 	int result;
348 
349 	dualpi2_read_ect(skb);
350 	if (cb->ect & q->ecn_mask) {
351 		cb->classified = DUALPI2_C_L4S;
352 		return NET_XMIT_SUCCESS;
353 	}
354 
355 	if (TC_H_MAJ(skb->priority) == q->sch->handle &&
356 	    TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
357 		cb->classified = TC_H_MIN(skb->priority);
358 		return NET_XMIT_SUCCESS;
359 	}
360 
361 	fl = rcu_dereference_bh(q->tcf_filters);
362 	if (!fl) {
363 		cb->classified = DUALPI2_C_CLASSIC;
364 		return NET_XMIT_SUCCESS;
365 	}
366 
367 	result = tcf_classify(skb, NULL, fl, &res, false);
368 	if (result >= 0) {
369 #ifdef CONFIG_NET_CLS_ACT
370 		switch (result) {
371 		case TC_ACT_STOLEN:
372 		case TC_ACT_QUEUED:
373 		case TC_ACT_TRAP:
374 			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
375 		case TC_ACT_SHOT:
376 			return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
377 		}
378 #endif
379 		cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
380 			TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
381 	}
382 	return NET_XMIT_SUCCESS;
383 }
384 
385 static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
386 			       struct sk_buff **to_free)
387 {
388 	struct dualpi2_sched_data *q = qdisc_priv(sch);
389 	struct dualpi2_skb_cb *cb;
390 
391 	if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
392 	    unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
393 		qdisc_qstats_overlimit(sch);
394 		if (skb_in_l_queue(skb))
395 			qdisc_qstats_overlimit(q->l_queue);
396 		return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT);
397 	}
398 
399 	if (q->drop_early && must_drop(sch, q, skb)) {
400 		qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_CONGESTED);
401 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
402 	}
403 
404 	cb = dualpi2_skb_cb(skb);
405 	cb->ts = ktime_get_ns();
406 	WRITE_ONCE(q->memory_used, q->memory_used + skb->truesize);
407 	if (q->memory_used > q->max_memory_used)
408 		WRITE_ONCE(q->max_memory_used, q->memory_used);
409 
410 	if (qdisc_qlen(sch) > q->maxq)
411 		WRITE_ONCE(q->maxq, qdisc_qlen(sch));
412 
413 	if (skb_in_l_queue(skb)) {
414 		/* Apply step thresh if skb is L4S && L-queue len >= min_qlen */
415 		dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
416 
417 		/* Keep the overall qdisc stats consistent */
418 		qdisc_qlen_inc(sch);
419 		qdisc_qstats_backlog_inc(sch, skb);
420 		WRITE_ONCE(q->packets_in_l, q->packets_in_l + 1);
421 		if (!q->l_head_ts)
422 			WRITE_ONCE(q->l_head_ts, cb->ts);
423 		return qdisc_enqueue_tail(skb, q->l_queue);
424 	}
425 	WRITE_ONCE(q->packets_in_c, q->packets_in_c + 1);
426 	if (!q->c_head_ts)
427 		WRITE_ONCE(q->c_head_ts, cb->ts);
428 	return qdisc_enqueue_tail(skb, sch);
429 }
430 
431 /* By default, dualpi2 will split GSO skbs into independent skbs and enqueue
432  * each of those individually. This yields the following benefits, at the
433  * expense of CPU usage:
434  * - Finer-grained AQM actions as the sub-packets of a burst no longer share the
435  *   same fate (e.g., the random mark/drop probability is applied individually)
436  * - Improved precision of the starvation protection/WRR scheduler at dequeue,
437  *   as the size of the dequeued packets will be smaller.
438  */
439 static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
440 				 struct sk_buff **to_free)
441 {
442 	struct dualpi2_sched_data *q = qdisc_priv(sch);
443 	int err;
444 
445 	err = dualpi2_skb_classify(q, skb);
446 	if (err != NET_XMIT_SUCCESS) {
447 		if (err & __NET_XMIT_BYPASS)
448 			qdisc_qstats_drop(sch);
449 		__qdisc_drop(skb, to_free);
450 		return err;
451 	}
452 
453 	if (q->split_gso && skb_is_gso(skb)) {
454 		netdev_features_t features;
455 		struct sk_buff *nskb, *next;
456 		int cnt, byte_len, orig_len;
457 		int err;
458 
459 		features = netif_skb_features(skb);
460 		nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
461 		if (IS_ERR_OR_NULL(nskb))
462 			return qdisc_drop(skb, sch, to_free);
463 
464 		cnt = 0;
465 		byte_len = 0;
466 		orig_len = qdisc_pkt_len(skb);
467 		skb_list_walk_safe(nskb, nskb, next) {
468 			skb_mark_not_on_list(nskb);
469 
470 			/* Iterate through GSO fragments of an skb:
471 			 * (1) Set pkt_len from the single GSO fragments
472 			 * (2) Copy classified and ect values of an skb
473 			 * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
474 			 */
475 			qdisc_skb_cb(nskb)->pkt_len = nskb->len;
476 			qdisc_skb_cb(nskb)->pkt_segs = 1;
477 			dualpi2_skb_cb(nskb)->classified =
478 				dualpi2_skb_cb(skb)->classified;
479 			dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
480 			err = dualpi2_enqueue_skb(nskb, sch, to_free);
481 
482 			if (err == NET_XMIT_SUCCESS) {
483 				/* Compute the backlog adjustment that needs
484 				 * to be propagated in the qdisc tree to reflect
485 				 * all new skbs successfully enqueued.
486 				 */
487 				++cnt;
488 				byte_len += nskb->len;
489 			}
490 		}
491 		if (cnt > 0) {
492 			/* The caller will add the original skb stats to its
493 			 * backlog, compensate this if any nskb is enqueued.
494 			 */
495 			qdisc_tree_reduce_backlog(sch, 1 - cnt,
496 						  orig_len - byte_len);
497 		}
498 		consume_skb(skb);
499 		return cnt > 0 ? NET_XMIT_SUCCESS : err;
500 	}
501 	return dualpi2_enqueue_skb(skb, sch, to_free);
502 }
503 
504 /* Select the queue from which the next packet can be dequeued, ensuring that
505  * neither queue can starve the other with a WRR scheduler.
506  *
507  * The sign of the WRR credit determines the next queue, while the size of
508  * the dequeued packet determines the magnitude of the WRR credit change. If
509  * either queue is empty, the WRR credit is kept unchanged.
510  *
511  * As the dequeued packet can be dropped later, the caller has to perform the
512  * qdisc_bstats_update() calls.
513  */
514 static struct sk_buff *dequeue_packet(struct Qdisc *sch,
515 				      struct dualpi2_sched_data *q,
516 				      int *credit_change,
517 				      u64 now)
518 {
519 	struct sk_buff *skb = NULL;
520 	int c_len;
521 
522 	*credit_change = 0;
523 	c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
524 	if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
525 		skb = __qdisc_dequeue_head(&q->l_queue->q);
526 		WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
527 		if (c_len)
528 			*credit_change = q->c_protection_wc;
529 		qdisc_qstats_backlog_dec(q->l_queue, skb);
530 
531 		/* Keep the global queue size consistent */
532 		qdisc_qlen_dec(sch);
533 	} else if (c_len) {
534 		skb = __qdisc_dequeue_head(&sch->q);
535 		WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
536 		if (qdisc_qlen(q->l_queue))
537 			*credit_change = ~((s32)q->c_protection_wl) + 1;
538 	} else {
539 		dualpi2_reset_c_protection(q);
540 		return NULL;
541 	}
542 	WRITE_ONCE(q->memory_used, q->memory_used - skb->truesize);
543 	*credit_change *= qdisc_pkt_len(skb);
544 	qdisc_qstats_backlog_dec(sch, skb);
545 	return skb;
546 }
547 
548 static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
549 		       u64 now)
550 {
551 	u64 qdelay = 0;
552 
553 	if (q->step_in_packets)
554 		qdelay = qdisc_qlen(q->l_queue);
555 	else
556 		qdelay = dualpi2_sojourn_time(skb, now);
557 
558 	if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
559 		if (!dualpi2_skb_cb(skb)->ect) {
560 			/* Drop this non-ECT packet */
561 			return 1;
562 		}
563 
564 		if (dualpi2_mark(q, skb))
565 			WRITE_ONCE(q->step_marks, q->step_marks + 1);
566 	}
567 	qdisc_bstats_update(q->l_queue, skb);
568 	return 0;
569 }
570 
571 static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
572 			   struct Qdisc *sch, enum qdisc_drop_reason reason)
573 {
574 	++q->deferred_drops_cnt;
575 	q->deferred_drops_len += qdisc_pkt_len(skb);
576 	qdisc_dequeue_drop(sch, skb, reason);
577 	qdisc_qstats_drop(sch);
578 }
579 
580 static struct sk_buff *__dualpi2_qdisc_dequeue(struct Qdisc *sch)
581 {
582 	struct dualpi2_sched_data *q = qdisc_priv(sch);
583 	struct sk_buff *skb;
584 	int credit_change;
585 	u64 now;
586 
587 	now = ktime_get_ns();
588 
589 	while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
590 		if (!q->drop_early && must_drop(sch, q, skb)) {
591 			drop_and_retry(q, skb, sch, QDISC_DROP_CONGESTED);
592 			continue;
593 		}
594 
595 		if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
596 			qdisc_qstats_drop(q->l_queue);
597 			drop_and_retry(q, skb, sch, QDISC_DROP_L4S_STEP_NON_ECN);
598 			continue;
599 		}
600 
601 		WRITE_ONCE(q->c_protection_credit,
602 			   q->c_protection_credit + credit_change);
603 		qdisc_bstats_update(sch, skb);
604 		break;
605 	}
606 
607 	return skb;
608 }
609 
610 static void dualpi2_dequeue_drop(struct Qdisc *sch)
611 {
612 	struct dualpi2_sched_data *q = qdisc_priv(sch);
613 
614 	if (q->deferred_drops_cnt) {
615 		qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
616 					  q->deferred_drops_len);
617 		q->deferred_drops_cnt = 0;
618 		q->deferred_drops_len = 0;
619 	}
620 }
621 
/* Qdisc dequeue entry point: dequeue the next surviving packet (or NULL) and
 * flush the drops deferred while doing so.
 */
static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb = __dualpi2_qdisc_dequeue(sch);

	dualpi2_dequeue_drop(sch);
	return skb;
}
632 
633 static struct sk_buff *dualpi2_peek(struct Qdisc *sch)
634 {
635 	struct sk_buff *skb = skb_peek(&sch->gso_skb);
636 
637 	if (!skb) {
638 		skb = __dualpi2_qdisc_dequeue(sch);
639 
640 		if (skb) {
641 			__skb_queue_head(&sch->gso_skb, skb);
642 			/* it's still part of the queue */
643 			qdisc_qstats_backlog_inc(sch, skb);
644 			sch->q.qlen++;
645 		}
646 
647 		dualpi2_dequeue_drop(sch);
648 	}
649 
650 	return skb;
651 }
652 
653 static s64 __scale_delta(u64 diff)
654 {
655 	do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
656 	return diff;
657 }
658 
659 static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
660 			     u64 *qdelay_l)
661 {
662 	u64 now, qc, ql;
663 
664 	now = ktime_get_ns();
665 	qc = READ_ONCE(q->c_head_ts);
666 	ql = READ_ONCE(q->l_head_ts);
667 
668 	*qdelay_c = qc ? now - qc : 0;
669 	*qdelay_l = ql ? now - ql : 0;
670 }
671 
672 static u32 calculate_probability(struct Qdisc *sch)
673 {
674 	struct dualpi2_sched_data *q = qdisc_priv(sch);
675 	u32 new_prob;
676 	u64 qdelay_c;
677 	u64 qdelay_l;
678 	u64 qdelay;
679 	s64 delta;
680 
681 	get_queue_delays(q, &qdelay_c, &qdelay_l);
682 	qdelay = max(qdelay_l, qdelay_c);
683 
684 	/* Alpha and beta take at most 32b, i.e, the delay difference would
685 	 * overflow for queuing delay differences > ~4.2sec.
686 	 */
687 	delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
688 	delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
689 	q->last_qdelay = qdelay;
690 
691 	/* Bound new_prob between 0 and MAX_PROB */
692 	if (delta > 0) {
693 		new_prob = __scale_delta(delta) + q->pi2_prob;
694 		if (new_prob < q->pi2_prob)
695 			new_prob = MAX_PROB;
696 	} else {
697 		new_prob = q->pi2_prob - __scale_delta(~delta + 1);
698 		if (new_prob > q->pi2_prob)
699 			new_prob = 0;
700 	}
701 
702 	/* If we do not drop on overload, ensure we cap the L4S probability to
703 	 * 100% to keep window fairness when overflowing.
704 	 */
705 	if (!q->drop_overload)
706 		return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
707 	return new_prob;
708 }
709 
710 static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
711 {
712 	/* Apply rule of thumb, i.e., doubling the packet length,
713 	 * to further include per packet overhead in memory_limit.
714 	 */
715 	u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
716 
717 	if (upper_32_bits(memlim))
718 		return U32_MAX;
719 	else
720 		return lower_32_bits(memlim);
721 }
722 
723 static u32 convert_us_to_nsec(u32 us)
724 {
725 	u64 ns = mul_u32_u32(us, NSEC_PER_USEC);
726 
727 	if (upper_32_bits(ns))
728 		return U32_MAX;
729 
730 	return lower_32_bits(ns);
731 }
732 
733 static u32 convert_ns_to_usec(u64 ns)
734 {
735 	do_div(ns, NSEC_PER_USEC);
736 	if (upper_32_bits(ns))
737 		return U32_MAX;
738 
739 	return lower_32_bits(ns);
740 }
741 
742 static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
743 {
744 	struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
745 	struct Qdisc *sch = q->sch;
746 	spinlock_t *root_lock; /* to lock qdisc for probability calculations */
747 
748 	rcu_read_lock();
749 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
750 	spin_lock(root_lock);
751 
752 	WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
753 	hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
754 
755 	spin_unlock(root_lock);
756 	rcu_read_unlock();
757 	return HRTIMER_RESTART;
758 }
759 
760 static struct netlink_range_validation dualpi2_alpha_beta_range = {
761 	.min = 1,
762 	.max = ALPHA_BETA_MAX,
763 };
764 
/* Validation policy for the nested TCA_DUALPI2_* attributes, enforced by
 * nla_parse_nested() in dualpi2_change().
 */
static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
	[TCA_DUALPI2_LIMIT]		= NLA_POLICY_MIN(NLA_U32, 1),
	[TCA_DUALPI2_MEMORY_LIMIT]	= NLA_POLICY_MIN(NLA_U32, 1),
	[TCA_DUALPI2_TARGET]		= { .type = NLA_U32 },
	[TCA_DUALPI2_TUPDATE]		= NLA_POLICY_MIN(NLA_U32, 1),
	[TCA_DUALPI2_ALPHA]		=
		NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
	[TCA_DUALPI2_BETA]		=
		NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
	[TCA_DUALPI2_STEP_THRESH_PKTS]	= { .type = NLA_U32 },
	[TCA_DUALPI2_STEP_THRESH_US]	= { .type = NLA_U32 },
	[TCA_DUALPI2_MIN_QLEN_STEP]	= { .type = NLA_U32 },
	/* The coupling factor is used as a divisor, hence must be >= 1 */
	[TCA_DUALPI2_COUPLING]		= NLA_POLICY_MIN(NLA_U8, 1),
	[TCA_DUALPI2_DROP_OVERLOAD]	=
		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
	[TCA_DUALPI2_DROP_EARLY]	=
		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
	[TCA_DUALPI2_C_PROTECTION]	=
		NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
	[TCA_DUALPI2_ECN_MASK]		=
		NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
				 TCA_DUALPI2_ECN_MASK_MAX),
	[TCA_DUALPI2_SPLIT_GSO]		=
		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
};
790 
791 static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
792 			  struct netlink_ext_ack *extack)
793 {
794 	struct nlattr *tb[TCA_DUALPI2_MAX + 1];
795 	struct dualpi2_sched_data *q;
796 	int old_backlog;
797 	int old_qlen;
798 	int err;
799 
800 	if (!opt || !nla_len(opt)) {
801 		NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
802 		return -EINVAL;
803 	}
804 	err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
805 			       extack);
806 	if (err < 0)
807 		return err;
808 	if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
809 		NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
810 		return -EINVAL;
811 	}
812 
813 	q = qdisc_priv(sch);
814 	sch_tree_lock(sch);
815 
816 	if (tb[TCA_DUALPI2_LIMIT]) {
817 		u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
818 
819 		WRITE_ONCE(sch->limit, limit);
820 		WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
821 	}
822 
823 	if (tb[TCA_DUALPI2_MEMORY_LIMIT])
824 		WRITE_ONCE(q->memory_limit,
825 			   nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
826 
827 	if (tb[TCA_DUALPI2_TARGET]) {
828 		u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
829 
830 		WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
831 	}
832 
833 	if (tb[TCA_DUALPI2_TUPDATE]) {
834 		u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
835 
836 		WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
837 	}
838 
839 	if (tb[TCA_DUALPI2_ALPHA]) {
840 		u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
841 
842 		WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
843 	}
844 
845 	if (tb[TCA_DUALPI2_BETA]) {
846 		u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
847 
848 		WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
849 	}
850 
851 	if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
852 		u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
853 
854 		WRITE_ONCE(q->step_in_packets, true);
855 		WRITE_ONCE(q->step_thresh, step_th);
856 	} else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
857 		u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
858 
859 		WRITE_ONCE(q->step_in_packets, false);
860 		WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
861 	}
862 
863 	if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
864 		WRITE_ONCE(q->min_qlen_step,
865 			   nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
866 
867 	if (tb[TCA_DUALPI2_COUPLING]) {
868 		u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
869 
870 		WRITE_ONCE(q->coupling_factor, coupling);
871 	}
872 
873 	if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
874 		u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
875 
876 		WRITE_ONCE(q->drop_overload, (bool)drop_overload);
877 	}
878 
879 	if (tb[TCA_DUALPI2_DROP_EARLY]) {
880 		u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
881 
882 		WRITE_ONCE(q->drop_early, (bool)drop_early);
883 	}
884 
885 	if (tb[TCA_DUALPI2_C_PROTECTION]) {
886 		u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
887 
888 		dualpi2_calculate_c_protection(sch, q, wc);
889 	}
890 
891 	if (tb[TCA_DUALPI2_ECN_MASK]) {
892 		u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
893 
894 		WRITE_ONCE(q->ecn_mask, ecn_mask);
895 	}
896 
897 	if (tb[TCA_DUALPI2_SPLIT_GSO]) {
898 		u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
899 
900 		WRITE_ONCE(q->split_gso, (bool)split_gso);
901 	}
902 
903 	old_qlen = qdisc_qlen(sch);
904 	old_backlog = sch->qstats.backlog;
905 	while (qdisc_qlen(sch) > sch->limit ||
906 	       q->memory_used > q->memory_limit) {
907 		struct sk_buff *skb = NULL;
908 
909 		if (qdisc_qlen(sch) > qdisc_qlen(q->l_queue)) {
910 			skb = qdisc_dequeue_internal(sch, true);
911 			if (unlikely(!skb)) {
912 				WARN_ON_ONCE(1);
913 				break;
914 			}
915 			WRITE_ONCE(q->memory_used, q->memory_used - skb->truesize);
916 			rtnl_qdisc_drop(skb, sch);
917 		} else if (qdisc_qlen(q->l_queue)) {
918 			skb = qdisc_dequeue_internal(q->l_queue, true);
919 			if (unlikely(!skb)) {
920 				WARN_ON_ONCE(1);
921 				break;
922 			}
923 			/* L-queue packets are counted in both sch and
924 			 * l_queue on enqueue; qdisc_dequeue_internal()
925 			 * handled l_queue, so we further account for sch.
926 			 */
927 			qdisc_qlen_dec(sch);
928 			qdisc_qstats_backlog_dec(sch, skb);
929 			WRITE_ONCE(q->memory_used, q->memory_used - skb->truesize);
930 			rtnl_qdisc_drop(skb, q->l_queue);
931 			qdisc_qstats_drop(sch);
932 		} else {
933 			WARN_ON_ONCE(1);
934 			break;
935 		}
936 	}
937 	qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
938 				  old_backlog - sch->qstats.backlog);
939 
940 	sch_tree_unlock(sch);
941 	return 0;
942 }
943 
944 /* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */
945 static void dualpi2_reset_default(struct Qdisc *sch)
946 {
947 	struct dualpi2_sched_data *q = qdisc_priv(sch);
948 
949 	q->sch->limit = 10000;				/* Max 125ms at 1Gbps */
950 	q->memory_limit = get_memory_limit(sch, q->sch->limit);
951 
952 	q->pi2_target = 15 * NSEC_PER_MSEC;
953 	q->pi2_tupdate = 16 * NSEC_PER_MSEC;
954 	q->pi2_alpha = dualpi2_scale_alpha_beta(41);	/* ~0.16 Hz * 256 */
955 	q->pi2_beta = dualpi2_scale_alpha_beta(819);	/* ~3.20 Hz * 256 */
956 
957 	q->step_thresh = 1 * NSEC_PER_MSEC;
958 	q->step_in_packets = false;
959 
960 	dualpi2_calculate_c_protection(q->sch, q, 10);	/* wc=10%, wl=90% */
961 
962 	q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT;	/* INET_ECN_ECT_1 */
963 	q->min_qlen_step = 0;		/* Always apply step mark in L-queue */
964 	q->coupling_factor = 2;		/* window fairness for equal RTTs */
965 	q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */
966 	q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */
967 	q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO;	/* Split GSO */
968 }
969 
970 static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
971 			struct netlink_ext_ack *extack)
972 {
973 	struct dualpi2_sched_data *q = qdisc_priv(sch);
974 	int err;
975 
976 	sch->flags |= TCQ_F_DEQUEUE_DROPS;
977 	hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
978 		      HRTIMER_MODE_ABS_PINNED_SOFT);
979 
980 	q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
981 				       TC_H_MAKE(sch->handle, 1), extack);
982 	if (!q->l_queue)
983 		return -ENOMEM;
984 
985 	err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
986 	if (err)
987 		return err;
988 
989 	q->sch = sch;
990 	dualpi2_reset_default(sch);
991 
992 	if (opt && nla_len(opt)) {
993 		err = dualpi2_change(sch, opt, extack);
994 
995 		if (err)
996 			return err;
997 	}
998 
999 	hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
1000 		      HRTIMER_MODE_ABS_PINNED_SOFT);
1001 	return 0;
1002 }
1003 
1004 static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
1005 {
1006 	struct dualpi2_sched_data *q = qdisc_priv(sch);
1007 	struct nlattr *opts;
1008 	bool step_in_pkts;
1009 	u32 step_th;
1010 
1011 	step_in_pkts = READ_ONCE(q->step_in_packets);
1012 	step_th = READ_ONCE(q->step_thresh);
1013 
1014 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
1015 	if (!opts)
1016 		goto nla_put_failure;
1017 
1018 	if (step_in_pkts &&
1019 	    (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
1020 	    nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
1021 			READ_ONCE(q->memory_limit)) ||
1022 	    nla_put_u32(skb, TCA_DUALPI2_TARGET,
1023 			convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
1024 	    nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
1025 			convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
1026 	    nla_put_u32(skb, TCA_DUALPI2_ALPHA,
1027 			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
1028 	    nla_put_u32(skb, TCA_DUALPI2_BETA,
1029 			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
1030 	    nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
1031 	    nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
1032 			READ_ONCE(q->min_qlen_step)) ||
1033 	    nla_put_u8(skb, TCA_DUALPI2_COUPLING,
1034 		       READ_ONCE(q->coupling_factor)) ||
1035 	    nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
1036 		       READ_ONCE(q->drop_overload)) ||
1037 	    nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
1038 		       READ_ONCE(q->drop_early)) ||
1039 	    nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
1040 		       READ_ONCE(q->c_protection_wc)) ||
1041 	    nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
1042 	    nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
1043 		goto nla_put_failure;
1044 
1045 	if (!step_in_pkts &&
1046 	    (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
1047 	    nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
1048 			READ_ONCE(q->memory_limit)) ||
1049 	    nla_put_u32(skb, TCA_DUALPI2_TARGET,
1050 			convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
1051 	    nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
1052 			convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
1053 	    nla_put_u32(skb, TCA_DUALPI2_ALPHA,
1054 			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
1055 	    nla_put_u32(skb, TCA_DUALPI2_BETA,
1056 			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
1057 	    nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
1058 			convert_ns_to_usec(step_th)) ||
1059 	    nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
1060 			READ_ONCE(q->min_qlen_step)) ||
1061 	    nla_put_u8(skb, TCA_DUALPI2_COUPLING,
1062 		       READ_ONCE(q->coupling_factor)) ||
1063 	    nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
1064 		       READ_ONCE(q->drop_overload)) ||
1065 	    nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
1066 		       READ_ONCE(q->drop_early)) ||
1067 	    nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
1068 		       READ_ONCE(q->c_protection_wc)) ||
1069 	    nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
1070 	    nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
1071 		goto nla_put_failure;
1072 
1073 	return nla_nest_end(skb, opts);
1074 
1075 nla_put_failure:
1076 	nla_nest_cancel(skb, opts);
1077 	return -1;
1078 }
1079 
1080 static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1081 {
1082 	struct dualpi2_sched_data *q = qdisc_priv(sch);
1083 	struct tc_dualpi2_xstats st = {
1084 		.prob			= READ_ONCE(q->pi2_prob),
1085 		.packets_in_c		= READ_ONCE(q->packets_in_c),
1086 		.packets_in_l		= READ_ONCE(q->packets_in_l),
1087 		.maxq			= READ_ONCE(q->maxq),
1088 		.ecn_mark		= READ_ONCE(q->ecn_mark),
1089 		.credit			= READ_ONCE(q->c_protection_credit),
1090 		.step_marks		= READ_ONCE(q->step_marks),
1091 		.memory_used		= READ_ONCE(q->memory_used),
1092 		.max_memory_used	= READ_ONCE(q->max_memory_used),
1093 		.memory_limit		= READ_ONCE(q->memory_limit),
1094 	};
1095 	u64 qc, ql;
1096 
1097 	get_queue_delays(q, &qc, &ql);
1098 	st.delay_l = convert_ns_to_usec(ql);
1099 	st.delay_c = convert_ns_to_usec(qc);
1100 	return gnet_stats_copy_app(d, &st, sizeof(st));
1101 }
1102 
1103 /* Reset both L-queue and C-queue, internal packet counters, PI probability,
1104  * C-queue protection credit, and timestamps, while preserving current
1105  * configuration of DUALPI2.
1106  */
1107 static void dualpi2_reset(struct Qdisc *sch)
1108 {
1109 	struct dualpi2_sched_data *q = qdisc_priv(sch);
1110 
1111 	qdisc_reset_queue(sch);
1112 	qdisc_reset_queue(q->l_queue);
1113 	WRITE_ONCE(q->c_head_ts, 0);
1114 	WRITE_ONCE(q->l_head_ts, 0);
1115 	WRITE_ONCE(q->pi2_prob, 0);
1116 	WRITE_ONCE(q->packets_in_c, 0);
1117 	WRITE_ONCE(q->packets_in_l, 0);
1118 	WRITE_ONCE(q->maxq, 0);
1119 	WRITE_ONCE(q->ecn_mark, 0);
1120 	WRITE_ONCE(q->step_marks, 0);
1121 	WRITE_ONCE(q->memory_used, 0);
1122 	WRITE_ONCE(q->max_memory_used, 0);
1123 	dualpi2_reset_c_protection(q);
1124 }
1125 
1126 static void dualpi2_destroy(struct Qdisc *sch)
1127 {
1128 	struct dualpi2_sched_data *q = qdisc_priv(sch);
1129 
1130 	q->pi2_tupdate = 0;
1131 	hrtimer_cancel(&q->pi2_timer);
1132 	if (q->l_queue)
1133 		qdisc_put(q->l_queue);
1134 	tcf_block_put(q->tcf_block);
1135 }
1136 
/* Class op stub: no leaf qdisc is reported for any class, so this
 * always returns NULL regardless of @arg.
 */
static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}
1141 
1142 static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
1143 {
1144 	return 0;
1145 }
1146 
1147 static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
1148 				  u32 classid)
1149 {
1150 	return 0;
1151 }
1152 
/* Class op stub: dualpi2_bind() keeps no state, so there is nothing
 * to undo here.
 */
static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
{
}
1156 
1157 static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
1158 					   struct netlink_ext_ack *extack)
1159 {
1160 	struct dualpi2_sched_data *q = qdisc_priv(sch);
1161 
1162 	if (cl)
1163 		return NULL;
1164 	return q->tcf_block;
1165 }
1166 
1167 static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1168 {
1169 	unsigned int i;
1170 
1171 	if (arg->stop)
1172 		return;
1173 
1174 	/* We statically define only 2 queues */
1175 	for (i = 0; i < 2; i++) {
1176 		if (arg->count < arg->skip) {
1177 			arg->count++;
1178 			continue;
1179 		}
1180 		if (arg->fn(sch, i + 1, arg) < 0) {
1181 			arg->stop = 1;
1182 			break;
1183 		}
1184 		arg->count++;
1185 	}
1186 }
1187 
1188 /* Minimal class support to handle tc filters */
1189 static const struct Qdisc_class_ops dualpi2_class_ops = {
1190 	.leaf		= dualpi2_leaf,
1191 	.find		= dualpi2_find,
1192 	.tcf_block	= dualpi2_tcf_block,
1193 	.bind_tcf	= dualpi2_bind,
1194 	.unbind_tcf	= dualpi2_unbind,
1195 	.walk		= dualpi2_walk,
1196 };
1197 
1198 static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
1199 	.id		= "dualpi2",
1200 	.cl_ops		= &dualpi2_class_ops,
1201 	.priv_size	= sizeof(struct dualpi2_sched_data),
1202 	.enqueue	= dualpi2_qdisc_enqueue,
1203 	.dequeue	= dualpi2_qdisc_dequeue,
1204 	.peek		= dualpi2_peek,
1205 	.init		= dualpi2_init,
1206 	.destroy	= dualpi2_destroy,
1207 	.reset		= dualpi2_reset,
1208 	.change		= dualpi2_change,
1209 	.dump		= dualpi2_dump,
1210 	.dump_stats	= dualpi2_dump_stats,
1211 	.owner		= THIS_MODULE,
1212 };
1213 MODULE_ALIAS_NET_SCH("dualpi2");
1214 
1215 static int __init dualpi2_module_init(void)
1216 {
1217 	return register_qdisc(&dualpi2_qdisc_ops);
1218 }
1219 
1220 static void __exit dualpi2_module_exit(void)
1221 {
1222 	unregister_qdisc(&dualpi2_qdisc_ops);
1223 }
1224 
/* Hook the registration helpers into module load and unload. */
module_init(dualpi2_module_init);
module_exit(dualpi2_module_exit);

/* Module metadata: description and authors. */
MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");

/* Module metadata: license and version. */
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION("1.0");
1237