1 // SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause 2 /* Copyright (C) 2024 Nokia 3 * 4 * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> 5 * Author: Olga Albisser <olga@albisser.org> 6 * Author: Henrik Steen <henrist@henrist.net> 7 * Author: Olivier Tilmans <olivier.tilmans@nokia.com> 8 * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> 9 * 10 * DualPI Improved with a Square (dualpi2): 11 * - Supports congestion controls that comply with the Prague requirements 12 * in RFC9331 (e.g. TCP-Prague) 13 * - Supports coupled dual-queue with PI2 as defined in RFC9332 14 * - Supports ECN L4S-identifier (IP.ECN==0b*1) 15 * 16 * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks, 17 * they do not meet the 'Prague L4S Requirements' listed in RFC 9331 18 * Section 4, so they can only be used with DualPI2 in a datacenter 19 * context. 20 * 21 * References: 22 * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332 23 * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and 24 * scalable TCP." in proc. ACM CoNEXT'16, 2016. 25 */ 26 27 #include <linux/errno.h> 28 #include <linux/hrtimer.h> 29 #include <linux/if_vlan.h> 30 #include <linux/kernel.h> 31 #include <linux/limits.h> 32 #include <linux/module.h> 33 #include <linux/skbuff.h> 34 #include <linux/types.h> 35 36 #include <net/gso.h> 37 #include <net/inet_ecn.h> 38 #include <net/pkt_cls.h> 39 #include <net/pkt_sched.h> 40 41 /* 32b enable to support flows with windows up to ~8.6 * 1e9 packets 42 * i.e., twice the maximal snd_cwnd. 43 * MAX_PROB must be consistent with the RNG in dualpi2_roll(). 44 */ 45 #define MAX_PROB U32_MAX 46 47 /* alpha/beta values exchanged over netlink are in units of 256ns */ 48 #define ALPHA_BETA_SHIFT 8 49 50 /* Scaled values of alpha/beta must fit in 32b to avoid overflow in later 51 * computations. Consequently (see and dualpi2_scale_alpha_beta()), their 52 * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1 53 * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to 54 * control flows whose maximal RTTs can be in usec up to few secs. 55 */ 56 #define ALPHA_BETA_MAX ((1U << 31) - 1) 57 58 /* Internal alpha/beta are in units of 64ns. 59 * This enables to use all alpha/beta values in the allowed range without loss 60 * of precision due to rounding when scaling them internally, e.g., 61 * scale_alpha_beta(1) will not round down to 0. 62 */ 63 #define ALPHA_BETA_GRANULARITY 6 64 65 #define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY) 66 67 /* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */ 68 #define MAX_WC 100 69 70 struct dualpi2_sched_data { 71 struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */ 72 struct Qdisc *sch; /* The Classic queue (C-queue) */ 73 74 /* Registered tc filters */ 75 struct tcf_proto __rcu *tcf_filters; 76 struct tcf_block *tcf_block; 77 78 /* PI2 parameters */ 79 u64 pi2_target; /* Target delay in nanoseconds */ 80 u32 pi2_tupdate; /* Timer frequency in nanoseconds */ 81 u32 pi2_prob; /* Base PI probability */ 82 u32 pi2_alpha; /* Gain factor for the integral rate response */ 83 u32 pi2_beta; /* Gain factor for the proportional response */ 84 struct hrtimer pi2_timer; /* prob update timer */ 85 86 /* Step AQM (L-queue only) parameters */ 87 u32 step_thresh; /* Step threshold */ 88 bool step_in_packets; /* Step thresh in packets (1) or time (0) */ 89 90 /* C-queue starvation protection */ 91 s32 c_protection_credit; /* Credit (sign indicates which queue) */ 92 s32 c_protection_init; /* Reset value of the credit */ 93 u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */ 94 u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */ 95 96 /* General dualQ parameters */ 97 u32 memory_limit; /* Memory limit of both queues */ 98 u8 coupling_factor;/* Coupling factor (k) between both queues */ 99 u8 ecn_mask; /* Mask to match packets into L-queue */ 100 u32 min_qlen_step; /* Minimum queue length to apply step thresh */ 101 bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */ 102 bool drop_overload; /* Drop (1) on overload, or overflow (0) */ 103 bool split_gso; /* Split aggregated skb (1) or leave as is (0) */ 104 105 /* Statistics */ 106 u64 c_head_ts; /* Enqueue timestamp of the C-queue head */ 107 u64 l_head_ts; /* Enqueue timestamp of the L-queue head */ 108 u64 last_qdelay; /* Q delay val at the last probability update */ 109 u32 packets_in_c; /* Enqueue packet counter of the C-queue */ 110 u32 packets_in_l; /* Enqueue packet counter of the L-queue */ 111 u32 maxq; /* Maximum queue size of the C-queue */ 112 u32 ecn_mark; /* ECN mark pkt counter due to PI probability */ 113 u32 step_marks; /* ECN mark pkt counter due to step AQM */ 114 u32 memory_used; /* Memory used of both queues */ 115 u32 max_memory_used;/* Maximum used memory */ 116 117 /* Deferred drop statistics */ 118 u32 deferred_drops_cnt; /* Packets dropped */ 119 u32 deferred_drops_len; /* Bytes dropped */ 120 }; 121 122 struct dualpi2_skb_cb { 123 u64 ts; /* Timestamp at enqueue */ 124 u8 apply_step:1, /* Can we apply the step threshold */ 125 classified:2, /* Packet classification results */ 126 ect:2; /* Packet ECT codepoint */ 127 }; 128 129 enum dualpi2_classification_results { 130 DUALPI2_C_CLASSIC = 0, /* C-queue */ 131 DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */ 132 DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */ 133 __DUALPI2_C_MAX /* Keep last*/ 134 }; 135 136 static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb) 137 { 138 qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb)); 139 return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data; 140 } 141 142 static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference) 143 { 144 return reference - dualpi2_skb_cb(skb)->ts; 145 } 146 147 static u64 head_enqueue_time(struct Qdisc *q) 148 { 149 struct sk_buff *skb = qdisc_peek_head(q); 150 151 return skb ? dualpi2_skb_cb(skb)->ts : 0; 152 } 153 154 static u32 dualpi2_scale_alpha_beta(u32 param) 155 { 156 u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); 157 158 do_div(tmp, NSEC_PER_SEC); 159 return tmp; 160 } 161 162 static u32 dualpi2_unscale_alpha_beta(u32 param) 163 { 164 u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING); 165 166 do_div(tmp, MAX_PROB); 167 return tmp; 168 } 169 170 static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) 171 { 172 return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); 173 } 174 175 static bool skb_is_l4s(struct sk_buff *skb) 176 { 177 return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S; 178 } 179 180 static bool skb_in_l_queue(struct sk_buff *skb) 181 { 182 return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC; 183 } 184 185 static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q) 186 { 187 return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step; 188 } 189 190 static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb) 191 { 192 if (INET_ECN_set_ce(skb)) { 193 WRITE_ONCE(q->ecn_mark, q->ecn_mark + 1); 194 return true; 195 } 196 return false; 197 } 198 199 static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) 200 { 201 WRITE_ONCE(q->c_protection_credit, q->c_protection_init); 202 } 203 204 /* This computes the initial credit value and WRR weight for the L queue (wl) 205 * from the weight of the C queue (wc). 206 * If wl > wc, the scheduler will start with the L queue when reset. 207 */ 208 static void dualpi2_calculate_c_protection(struct Qdisc *sch, 209 struct dualpi2_sched_data *q, u32 wc) 210 { 211 q->c_protection_wc = wc; 212 q->c_protection_wl = MAX_WC - wc; 213 q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) * 214 ((int)q->c_protection_wc - (int)q->c_protection_wl); 215 dualpi2_reset_c_protection(q); 216 } 217 218 static bool dualpi2_roll(u32 prob) 219 { 220 return get_random_u32() <= prob; 221 } 222 223 /* Packets in the C-queue are subject to a marking probability pC, which is the 224 * square of the internal PI probability (i.e., have an overall lower mark/drop 225 * probability). If the qdisc is overloaded, ignore ECT values and only drop. 226 * 227 * Note that this marking scheme is also applied to L4S packets during overload. 228 * Return true if packet dropping is required in C queue 229 */ 230 static bool dualpi2_classic_marking(struct dualpi2_sched_data *q, 231 struct sk_buff *skb, u32 prob, 232 bool overload) 233 { 234 if (dualpi2_roll(prob) && dualpi2_roll(prob)) { 235 if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) 236 return true; 237 dualpi2_mark(q, skb); 238 } 239 return false; 240 } 241 242 /* Packets in the L-queue are subject to a marking probability pL given by the 243 * internal PI probability scaled by the coupling factor. 244 * 245 * On overload (i.e., @local_l_prob is >= 100%): 246 * - if the qdisc is configured to trade losses to preserve latency (i.e., 247 * @q->drop_overload), apply classic drops first before marking. 248 * - otherwise, preserve the "no loss" property of ECN at the cost of queueing 249 * delay, eventually resulting in taildrop behavior once sch->limit is 250 * reached. 251 * Return true if packet dropping is required in L queue 252 */ 253 static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q, 254 struct sk_buff *skb, 255 u64 local_l_prob, u32 prob, 256 bool overload) 257 { 258 if (overload) { 259 /* Apply classic drop */ 260 if (!q->drop_overload || 261 !(dualpi2_roll(prob) && dualpi2_roll(prob))) 262 goto mark; 263 return true; 264 } 265 266 /* We can safely cut the upper 32b as overload==false */ 267 if (dualpi2_roll(local_l_prob)) { 268 /* Non-ECT packets could have classified as L4S by filters. */ 269 if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) 270 return true; 271 mark: 272 dualpi2_mark(q, skb); 273 } 274 return false; 275 } 276 277 /* Decide whether a given packet must be dropped (or marked if ECT), according 278 * to the PI2 probability. 279 * 280 * Never mark/drop if we have a standing queue of less than 2 MTUs. 281 */ 282 static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q, 283 struct sk_buff *skb) 284 { 285 u64 local_l_prob; 286 bool overload; 287 u32 prob; 288 289 if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch))) 290 return false; 291 292 prob = READ_ONCE(q->pi2_prob); 293 local_l_prob = (u64)prob * q->coupling_factor; 294 overload = local_l_prob > MAX_PROB; 295 296 switch (dualpi2_skb_cb(skb)->classified) { 297 case DUALPI2_C_CLASSIC: 298 return dualpi2_classic_marking(q, skb, prob, overload); 299 case DUALPI2_C_L4S: 300 return dualpi2_scalable_marking(q, skb, local_l_prob, prob, 301 overload); 302 default: /* DUALPI2_C_LLLL */ 303 return false; 304 } 305 } 306 307 static void dualpi2_read_ect(struct sk_buff *skb) 308 { 309 struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); 310 int wlen = skb_network_offset(skb); 311 312 switch (skb_protocol(skb, true)) { 313 case htons(ETH_P_IP): 314 wlen += sizeof(struct iphdr); 315 if (!pskb_may_pull(skb, wlen) || 316 skb_try_make_writable(skb, wlen)) 317 goto not_ecn; 318 319 cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK; 320 break; 321 case htons(ETH_P_IPV6): 322 wlen += sizeof(struct ipv6hdr); 323 if (!pskb_may_pull(skb, wlen) || 324 skb_try_make_writable(skb, wlen)) 325 goto not_ecn; 326 327 cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK; 328 break; 329 default: 330 goto not_ecn; 331 } 332 return; 333 334 not_ecn: 335 /* Non pullable/writable packets can only be dropped hence are 336 * classified as not ECT. 337 */ 338 cb->ect = INET_ECN_NOT_ECT; 339 } 340 341 static int dualpi2_skb_classify(struct dualpi2_sched_data *q, 342 struct sk_buff *skb) 343 { 344 struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); 345 struct tcf_result res; 346 struct tcf_proto *fl; 347 int result; 348 349 dualpi2_read_ect(skb); 350 if (cb->ect & q->ecn_mask) { 351 cb->classified = DUALPI2_C_L4S; 352 return NET_XMIT_SUCCESS; 353 } 354 355 if (TC_H_MAJ(skb->priority) == q->sch->handle && 356 TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) { 357 cb->classified = TC_H_MIN(skb->priority); 358 return NET_XMIT_SUCCESS; 359 } 360 361 fl = rcu_dereference_bh(q->tcf_filters); 362 if (!fl) { 363 cb->classified = DUALPI2_C_CLASSIC; 364 return NET_XMIT_SUCCESS; 365 } 366 367 result = tcf_classify(skb, NULL, fl, &res, false); 368 if (result >= 0) { 369 #ifdef CONFIG_NET_CLS_ACT 370 switch (result) { 371 case TC_ACT_STOLEN: 372 case TC_ACT_QUEUED: 373 case TC_ACT_TRAP: 374 return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 375 case TC_ACT_SHOT: 376 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 377 } 378 #endif 379 cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ? 380 TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC; 381 } 382 return NET_XMIT_SUCCESS; 383 } 384 385 static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, 386 struct sk_buff **to_free) 387 { 388 struct dualpi2_sched_data *q = qdisc_priv(sch); 389 struct dualpi2_skb_cb *cb; 390 391 if (unlikely(qdisc_qlen(sch) >= sch->limit) || 392 unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) { 393 qdisc_qstats_overlimit(sch); 394 if (skb_in_l_queue(skb)) 395 qdisc_qstats_overlimit(q->l_queue); 396 return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); 397 } 398 399 if (q->drop_early && must_drop(sch, q, skb)) { 400 qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_CONGESTED); 401 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 402 } 403 404 cb = dualpi2_skb_cb(skb); 405 cb->ts = ktime_get_ns(); 406 WRITE_ONCE(q->memory_used, q->memory_used + skb->truesize); 407 if (q->memory_used > q->max_memory_used) 408 WRITE_ONCE(q->max_memory_used, q->memory_used); 409 410 if (qdisc_qlen(sch) > q->maxq) 411 WRITE_ONCE(q->maxq, qdisc_qlen(sch)); 412 413 if (skb_in_l_queue(skb)) { 414 /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */ 415 dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q); 416 417 /* Keep the overall qdisc stats consistent */ 418 qdisc_qlen_inc(sch); 419 qdisc_qstats_backlog_inc(sch, skb); 420 WRITE_ONCE(q->packets_in_l, q->packets_in_l + 1); 421 if (!q->l_head_ts) 422 WRITE_ONCE(q->l_head_ts, cb->ts); 423 return qdisc_enqueue_tail(skb, q->l_queue); 424 } 425 WRITE_ONCE(q->packets_in_c, q->packets_in_c + 1); 426 if (!q->c_head_ts) 427 WRITE_ONCE(q->c_head_ts, cb->ts); 428 return qdisc_enqueue_tail(skb, sch); 429 } 430 431 /* By default, dualpi2 will split GSO skbs into independent skbs and enqueue 432 * each of those individually. This yields the following benefits, at the 433 * expense of CPU usage: 434 * - Finer-grained AQM actions as the sub-packets of a burst no longer share the 435 * same fate (e.g., the random mark/drop probability is applied individually) 436 * - Improved precision of the starvation protection/WRR scheduler at dequeue, 437 * as the size of the dequeued packets will be smaller. 438 */ 439 static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, 440 struct sk_buff **to_free) 441 { 442 struct dualpi2_sched_data *q = qdisc_priv(sch); 443 int err; 444 445 err = dualpi2_skb_classify(q, skb); 446 if (err != NET_XMIT_SUCCESS) { 447 if (err & __NET_XMIT_BYPASS) 448 qdisc_qstats_drop(sch); 449 __qdisc_drop(skb, to_free); 450 return err; 451 } 452 453 if (q->split_gso && skb_is_gso(skb)) { 454 netdev_features_t features; 455 struct sk_buff *nskb, *next; 456 int cnt, byte_len, orig_len; 457 int err; 458 459 features = netif_skb_features(skb); 460 nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 461 if (IS_ERR_OR_NULL(nskb)) 462 return qdisc_drop(skb, sch, to_free); 463 464 cnt = 0; 465 byte_len = 0; 466 orig_len = qdisc_pkt_len(skb); 467 skb_list_walk_safe(nskb, nskb, next) { 468 skb_mark_not_on_list(nskb); 469 470 /* Iterate through GSO fragments of an skb: 471 * (1) Set pkt_len from the single GSO fragments 472 * (2) Copy classified and ect values of an skb 473 * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb 474 */ 475 qdisc_skb_cb(nskb)->pkt_len = nskb->len; 476 qdisc_skb_cb(nskb)->pkt_segs = 1; 477 dualpi2_skb_cb(nskb)->classified = 478 dualpi2_skb_cb(skb)->classified; 479 dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; 480 err = dualpi2_enqueue_skb(nskb, sch, to_free); 481 482 if (err == NET_XMIT_SUCCESS) { 483 /* Compute the backlog adjustment that needs 484 * to be propagated in the qdisc tree to reflect 485 * all new skbs successfully enqueued. 486 */ 487 ++cnt; 488 byte_len += nskb->len; 489 } 490 } 491 if (cnt > 0) { 492 /* The caller will add the original skb stats to its 493 * backlog, compensate this if any nskb is enqueued. 494 */ 495 qdisc_tree_reduce_backlog(sch, 1 - cnt, 496 orig_len - byte_len); 497 } 498 consume_skb(skb); 499 return cnt > 0 ? NET_XMIT_SUCCESS : err; 500 } 501 return dualpi2_enqueue_skb(skb, sch, to_free); 502 } 503 504 /* Select the queue from which the next packet can be dequeued, ensuring that 505 * neither queue can starve the other with a WRR scheduler. 506 * 507 * The sign of the WRR credit determines the next queue, while the size of 508 * the dequeued packet determines the magnitude of the WRR credit change. If 509 * either queue is empty, the WRR credit is kept unchanged. 510 * 511 * As the dequeued packet can be dropped later, the caller has to perform the 512 * qdisc_bstats_update() calls. 513 */ 514 static struct sk_buff *dequeue_packet(struct Qdisc *sch, 515 struct dualpi2_sched_data *q, 516 int *credit_change, 517 u64 now) 518 { 519 struct sk_buff *skb = NULL; 520 int c_len; 521 522 *credit_change = 0; 523 c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue); 524 if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) { 525 skb = __qdisc_dequeue_head(&q->l_queue->q); 526 WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue)); 527 if (c_len) 528 *credit_change = q->c_protection_wc; 529 qdisc_qstats_backlog_dec(q->l_queue, skb); 530 531 /* Keep the global queue size consistent */ 532 qdisc_qlen_dec(sch); 533 } else if (c_len) { 534 skb = __qdisc_dequeue_head(&sch->q); 535 WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch)); 536 if (qdisc_qlen(q->l_queue)) 537 *credit_change = ~((s32)q->c_protection_wl) + 1; 538 } else { 539 dualpi2_reset_c_protection(q); 540 return NULL; 541 } 542 WRITE_ONCE(q->memory_used, q->memory_used - skb->truesize); 543 *credit_change *= qdisc_pkt_len(skb); 544 qdisc_qstats_backlog_dec(sch, skb); 545 return skb; 546 } 547 548 static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, 549 u64 now) 550 { 551 u64 qdelay = 0; 552 553 if (q->step_in_packets) 554 qdelay = qdisc_qlen(q->l_queue); 555 else 556 qdelay = dualpi2_sojourn_time(skb, now); 557 558 if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) { 559 if (!dualpi2_skb_cb(skb)->ect) { 560 /* Drop this non-ECT packet */ 561 return 1; 562 } 563 564 if (dualpi2_mark(q, skb)) 565 WRITE_ONCE(q->step_marks, q->step_marks + 1); 566 } 567 qdisc_bstats_update(q->l_queue, skb); 568 return 0; 569 } 570 571 static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, 572 struct Qdisc *sch, enum qdisc_drop_reason reason) 573 { 574 ++q->deferred_drops_cnt; 575 q->deferred_drops_len += qdisc_pkt_len(skb); 576 qdisc_dequeue_drop(sch, skb, reason); 577 qdisc_qstats_drop(sch); 578 } 579 580 static struct sk_buff *__dualpi2_qdisc_dequeue(struct Qdisc *sch) 581 { 582 struct dualpi2_sched_data *q = qdisc_priv(sch); 583 struct sk_buff *skb; 584 int credit_change; 585 u64 now; 586 587 now = ktime_get_ns(); 588 589 while ((skb = dequeue_packet(sch, q, &credit_change, now))) { 590 if (!q->drop_early && must_drop(sch, q, skb)) { 591 drop_and_retry(q, skb, sch, QDISC_DROP_CONGESTED); 592 continue; 593 } 594 595 if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { 596 qdisc_qstats_drop(q->l_queue); 597 drop_and_retry(q, skb, sch, QDISC_DROP_L4S_STEP_NON_ECN); 598 continue; 599 } 600 601 WRITE_ONCE(q->c_protection_credit, 602 q->c_protection_credit + credit_change); 603 qdisc_bstats_update(sch, skb); 604 break; 605 } 606 607 return skb; 608 } 609 610 static void dualpi2_dequeue_drop(struct Qdisc *sch) 611 { 612 struct dualpi2_sched_data *q = qdisc_priv(sch); 613 614 if (q->deferred_drops_cnt) { 615 qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, 616 q->deferred_drops_len); 617 q->deferred_drops_cnt = 0; 618 q->deferred_drops_len = 0; 619 } 620 } 621 622 static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) 623 { 624 struct sk_buff *skb; 625 626 skb = __dualpi2_qdisc_dequeue(sch); 627 628 dualpi2_dequeue_drop(sch); 629 630 return skb; 631 } 632 633 static struct sk_buff *dualpi2_peek(struct Qdisc *sch) 634 { 635 struct sk_buff *skb = skb_peek(&sch->gso_skb); 636 637 if (!skb) { 638 skb = __dualpi2_qdisc_dequeue(sch); 639 640 if (skb) { 641 __skb_queue_head(&sch->gso_skb, skb); 642 /* it's still part of the queue */ 643 qdisc_qstats_backlog_inc(sch, skb); 644 sch->q.qlen++; 645 } 646 647 dualpi2_dequeue_drop(sch); 648 } 649 650 return skb; 651 } 652 653 static s64 __scale_delta(u64 diff) 654 { 655 do_div(diff, 1 << ALPHA_BETA_GRANULARITY); 656 return diff; 657 } 658 659 static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, 660 u64 *qdelay_l) 661 { 662 u64 now, qc, ql; 663 664 now = ktime_get_ns(); 665 qc = READ_ONCE(q->c_head_ts); 666 ql = READ_ONCE(q->l_head_ts); 667 668 *qdelay_c = qc ? now - qc : 0; 669 *qdelay_l = ql ? now - ql : 0; 670 } 671 672 static u32 calculate_probability(struct Qdisc *sch) 673 { 674 struct dualpi2_sched_data *q = qdisc_priv(sch); 675 u32 new_prob; 676 u64 qdelay_c; 677 u64 qdelay_l; 678 u64 qdelay; 679 s64 delta; 680 681 get_queue_delays(q, &qdelay_c, &qdelay_l); 682 qdelay = max(qdelay_l, qdelay_c); 683 684 /* Alpha and beta take at most 32b, i.e, the delay difference would 685 * overflow for queuing delay differences > ~4.2sec. 686 */ 687 delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha; 688 delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta; 689 q->last_qdelay = qdelay; 690 691 /* Bound new_prob between 0 and MAX_PROB */ 692 if (delta > 0) { 693 new_prob = __scale_delta(delta) + q->pi2_prob; 694 if (new_prob < q->pi2_prob) 695 new_prob = MAX_PROB; 696 } else { 697 new_prob = q->pi2_prob - __scale_delta(~delta + 1); 698 if (new_prob > q->pi2_prob) 699 new_prob = 0; 700 } 701 702 /* If we do not drop on overload, ensure we cap the L4S probability to 703 * 100% to keep window fairness when overflowing. 704 */ 705 if (!q->drop_overload) 706 return min_t(u32, new_prob, MAX_PROB / q->coupling_factor); 707 return new_prob; 708 } 709 710 static u32 get_memory_limit(struct Qdisc *sch, u32 limit) 711 { 712 /* Apply rule of thumb, i.e., doubling the packet length, 713 * to further include per packet overhead in memory_limit. 714 */ 715 u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch))); 716 717 if (upper_32_bits(memlim)) 718 return U32_MAX; 719 else 720 return lower_32_bits(memlim); 721 } 722 723 static u32 convert_us_to_nsec(u32 us) 724 { 725 u64 ns = mul_u32_u32(us, NSEC_PER_USEC); 726 727 if (upper_32_bits(ns)) 728 return U32_MAX; 729 730 return lower_32_bits(ns); 731 } 732 733 static u32 convert_ns_to_usec(u64 ns) 734 { 735 do_div(ns, NSEC_PER_USEC); 736 if (upper_32_bits(ns)) 737 return U32_MAX; 738 739 return lower_32_bits(ns); 740 } 741 742 static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) 743 { 744 struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); 745 struct Qdisc *sch = q->sch; 746 spinlock_t *root_lock; /* to lock qdisc for probability calculations */ 747 748 rcu_read_lock(); 749 root_lock = qdisc_lock(qdisc_root_sleeping(sch)); 750 spin_lock(root_lock); 751 752 WRITE_ONCE(q->pi2_prob, calculate_probability(sch)); 753 hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); 754 755 spin_unlock(root_lock); 756 rcu_read_unlock(); 757 return HRTIMER_RESTART; 758 } 759 760 static struct netlink_range_validation dualpi2_alpha_beta_range = { 761 .min = 1, 762 .max = ALPHA_BETA_MAX, 763 }; 764 765 static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = { 766 [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), 767 [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), 768 [TCA_DUALPI2_TARGET] = { .type = NLA_U32 }, 769 [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1), 770 [TCA_DUALPI2_ALPHA] = 771 NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), 772 [TCA_DUALPI2_BETA] = 773 NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), 774 [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 }, 775 [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 }, 776 [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 }, 777 [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1), 778 [TCA_DUALPI2_DROP_OVERLOAD] = 779 NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX), 780 [TCA_DUALPI2_DROP_EARLY] = 781 NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX), 782 [TCA_DUALPI2_C_PROTECTION] = 783 NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC), 784 [TCA_DUALPI2_ECN_MASK] = 785 NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT, 786 TCA_DUALPI2_ECN_MASK_MAX), 787 [TCA_DUALPI2_SPLIT_GSO] = 788 NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX), 789 }; 790 791 static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, 792 struct netlink_ext_ack *extack) 793 { 794 struct nlattr *tb[TCA_DUALPI2_MAX + 1]; 795 struct dualpi2_sched_data *q; 796 int old_backlog; 797 int old_qlen; 798 int err; 799 800 if (!opt || !nla_len(opt)) { 801 NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required"); 802 return -EINVAL; 803 } 804 err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy, 805 extack); 806 if (err < 0) 807 return err; 808 if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) { 809 NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes"); 810 return -EINVAL; 811 } 812 813 q = qdisc_priv(sch); 814 sch_tree_lock(sch); 815 816 if (tb[TCA_DUALPI2_LIMIT]) { 817 u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); 818 819 WRITE_ONCE(sch->limit, limit); 820 WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit)); 821 } 822 823 if (tb[TCA_DUALPI2_MEMORY_LIMIT]) 824 WRITE_ONCE(q->memory_limit, 825 nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT])); 826 827 if (tb[TCA_DUALPI2_TARGET]) { 828 u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); 829 830 WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC); 831 } 832 833 if (tb[TCA_DUALPI2_TUPDATE]) { 834 u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); 835 836 WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate)); 837 } 838 839 if (tb[TCA_DUALPI2_ALPHA]) { 840 u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); 841 842 WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha)); 843 } 844 845 if (tb[TCA_DUALPI2_BETA]) { 846 u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); 847 848 WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta)); 849 } 850 851 if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { 852 u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); 853 854 WRITE_ONCE(q->step_in_packets, true); 855 WRITE_ONCE(q->step_thresh, step_th); 856 } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { 857 u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); 858 859 WRITE_ONCE(q->step_in_packets, false); 860 WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th)); 861 } 862 863 if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) 864 WRITE_ONCE(q->min_qlen_step, 865 nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP])); 866 867 if (tb[TCA_DUALPI2_COUPLING]) { 868 u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); 869 870 WRITE_ONCE(q->coupling_factor, coupling); 871 } 872 873 if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { 874 u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); 875 876 WRITE_ONCE(q->drop_overload, (bool)drop_overload); 877 } 878 879 if (tb[TCA_DUALPI2_DROP_EARLY]) { 880 u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); 881 882 WRITE_ONCE(q->drop_early, (bool)drop_early); 883 } 884 885 if (tb[TCA_DUALPI2_C_PROTECTION]) { 886 u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]); 887 888 dualpi2_calculate_c_protection(sch, q, wc); 889 } 890 891 if (tb[TCA_DUALPI2_ECN_MASK]) { 892 u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); 893 894 WRITE_ONCE(q->ecn_mask, ecn_mask); 895 } 896 897 if (tb[TCA_DUALPI2_SPLIT_GSO]) { 898 u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); 899 900 WRITE_ONCE(q->split_gso, (bool)split_gso); 901 } 902 903 old_qlen = qdisc_qlen(sch); 904 old_backlog = sch->qstats.backlog; 905 while (qdisc_qlen(sch) > sch->limit || 906 q->memory_used > q->memory_limit) { 907 struct sk_buff *skb = NULL; 908 909 if (qdisc_qlen(sch) > qdisc_qlen(q->l_queue)) { 910 skb = qdisc_dequeue_internal(sch, true); 911 if (unlikely(!skb)) { 912 WARN_ON_ONCE(1); 913 break; 914 } 915 WRITE_ONCE(q->memory_used, q->memory_used - skb->truesize); 916 rtnl_qdisc_drop(skb, sch); 917 } else if (qdisc_qlen(q->l_queue)) { 918 skb = qdisc_dequeue_internal(q->l_queue, true); 919 if (unlikely(!skb)) { 920 WARN_ON_ONCE(1); 921 break; 922 } 923 /* L-queue packets are counted in both sch and 924 * l_queue on enqueue; qdisc_dequeue_internal() 925 * handled l_queue, so we further account for sch. 926 */ 927 qdisc_qlen_dec(sch); 928 qdisc_qstats_backlog_dec(sch, skb); 929 WRITE_ONCE(q->memory_used, q->memory_used - skb->truesize); 930 rtnl_qdisc_drop(skb, q->l_queue); 931 qdisc_qstats_drop(sch); 932 } else { 933 WARN_ON_ONCE(1); 934 break; 935 } 936 } 937 qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), 938 old_backlog - sch->qstats.backlog); 939 940 sch_tree_unlock(sch); 941 return 0; 942 } 943 944 /* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */ 945 static void dualpi2_reset_default(struct Qdisc *sch) 946 { 947 struct dualpi2_sched_data *q = qdisc_priv(sch); 948 949 q->sch->limit = 10000; /* Max 125ms at 1Gbps */ 950 q->memory_limit = get_memory_limit(sch, q->sch->limit); 951 952 q->pi2_target = 15 * NSEC_PER_MSEC; 953 q->pi2_tupdate = 16 * NSEC_PER_MSEC; 954 q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */ 955 q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */ 956 957 q->step_thresh = 1 * NSEC_PER_MSEC; 958 q->step_in_packets = false; 959 960 dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */ 961 962 q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */ 963 q->min_qlen_step = 0; /* Always apply step mark in L-queue */ 964 q->coupling_factor = 2; /* window fairness for equal RTTs */ 965 q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */ 966 q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */ 967 q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */ 968 } 969 970 static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, 971 struct netlink_ext_ack *extack) 972 { 973 struct dualpi2_sched_data *q = qdisc_priv(sch); 974 int err; 975 976 sch->flags |= TCQ_F_DEQUEUE_DROPS; 977 hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, 978 HRTIMER_MODE_ABS_PINNED_SOFT); 979 980 q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, 981 TC_H_MAKE(sch->handle, 1), extack); 982 if (!q->l_queue) 983 return -ENOMEM; 984 985 err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack); 986 if (err) 987 return err; 988 989 q->sch = sch; 990 dualpi2_reset_default(sch); 991 992 if (opt && nla_len(opt)) { 993 err = dualpi2_change(sch, opt, extack); 994 995 if (err) 996 return err; 997 } 998 999 hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), 1000 HRTIMER_MODE_ABS_PINNED_SOFT); 1001 return 0; 1002 } 1003 1004 static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb) 1005 { 1006 struct dualpi2_sched_data *q = qdisc_priv(sch); 1007 struct nlattr *opts; 1008 bool step_in_pkts; 1009 u32 step_th; 1010 1011 step_in_pkts = READ_ONCE(q->step_in_packets); 1012 step_th = READ_ONCE(q->step_thresh); 1013 1014 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 1015 if (!opts) 1016 goto nla_put_failure; 1017 1018 if (step_in_pkts && 1019 (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || 1020 nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, 1021 READ_ONCE(q->memory_limit)) || 1022 nla_put_u32(skb, TCA_DUALPI2_TARGET, 1023 convert_ns_to_usec(READ_ONCE(q->pi2_target))) || 1024 nla_put_u32(skb, TCA_DUALPI2_TUPDATE, 1025 convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || 1026 nla_put_u32(skb, TCA_DUALPI2_ALPHA, 1027 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || 1028 nla_put_u32(skb, TCA_DUALPI2_BETA, 1029 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || 1030 nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) || 1031 nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, 1032 READ_ONCE(q->min_qlen_step)) || 1033 nla_put_u8(skb, TCA_DUALPI2_COUPLING, 1034 READ_ONCE(q->coupling_factor)) || 1035 nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, 1036 READ_ONCE(q->drop_overload)) || 1037 nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, 1038 READ_ONCE(q->drop_early)) || 1039 nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, 1040 READ_ONCE(q->c_protection_wc)) || 1041 nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || 1042 nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) 1043 goto nla_put_failure; 1044 1045 if (!step_in_pkts && 1046 (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || 1047 nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, 1048 READ_ONCE(q->memory_limit)) || 1049 nla_put_u32(skb, TCA_DUALPI2_TARGET, 1050 convert_ns_to_usec(READ_ONCE(q->pi2_target))) || 1051 nla_put_u32(skb, TCA_DUALPI2_TUPDATE, 1052 convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || 1053 nla_put_u32(skb, TCA_DUALPI2_ALPHA, 1054 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || 1055 nla_put_u32(skb, TCA_DUALPI2_BETA, 1056 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || 1057 nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US, 1058 convert_ns_to_usec(step_th)) || 1059 nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, 1060 READ_ONCE(q->min_qlen_step)) || 1061 nla_put_u8(skb, TCA_DUALPI2_COUPLING, 1062 READ_ONCE(q->coupling_factor)) || 1063 nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, 1064 READ_ONCE(q->drop_overload)) || 1065 nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, 1066 READ_ONCE(q->drop_early)) || 1067 nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, 1068 READ_ONCE(q->c_protection_wc)) || 1069 nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || 1070 nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) 1071 goto nla_put_failure; 1072 1073 return nla_nest_end(skb, opts); 1074 1075 nla_put_failure: 1076 nla_nest_cancel(skb, opts); 1077 return -1; 1078 } 1079 1080 static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 1081 { 1082 struct dualpi2_sched_data *q = qdisc_priv(sch); 1083 struct tc_dualpi2_xstats st = { 1084 .prob = READ_ONCE(q->pi2_prob), 1085 .packets_in_c = READ_ONCE(q->packets_in_c), 1086 .packets_in_l = READ_ONCE(q->packets_in_l), 1087 .maxq = READ_ONCE(q->maxq), 1088 .ecn_mark = READ_ONCE(q->ecn_mark), 1089 .credit = READ_ONCE(q->c_protection_credit), 1090 .step_marks = READ_ONCE(q->step_marks), 1091 .memory_used = READ_ONCE(q->memory_used), 1092 .max_memory_used = READ_ONCE(q->max_memory_used), 1093 .memory_limit = READ_ONCE(q->memory_limit), 1094 }; 1095 u64 qc, ql; 1096 1097 get_queue_delays(q, &qc, &ql); 1098 st.delay_l = convert_ns_to_usec(ql); 1099 st.delay_c = convert_ns_to_usec(qc); 1100 return gnet_stats_copy_app(d, &st, sizeof(st)); 1101 } 1102 1103 /* Reset both L-queue and C-queue, internal packet counters, PI probability, 1104 * C-queue protection credit, and timestamps, while preserving current 1105 * configuration of DUALPI2. 1106 */ 1107 static void dualpi2_reset(struct Qdisc *sch) 1108 { 1109 struct dualpi2_sched_data *q = qdisc_priv(sch); 1110 1111 qdisc_reset_queue(sch); 1112 qdisc_reset_queue(q->l_queue); 1113 WRITE_ONCE(q->c_head_ts, 0); 1114 WRITE_ONCE(q->l_head_ts, 0); 1115 WRITE_ONCE(q->pi2_prob, 0); 1116 WRITE_ONCE(q->packets_in_c, 0); 1117 WRITE_ONCE(q->packets_in_l, 0); 1118 WRITE_ONCE(q->maxq, 0); 1119 WRITE_ONCE(q->ecn_mark, 0); 1120 WRITE_ONCE(q->step_marks, 0); 1121 WRITE_ONCE(q->memory_used, 0); 1122 WRITE_ONCE(q->max_memory_used, 0); 1123 dualpi2_reset_c_protection(q); 1124 } 1125 1126 static void dualpi2_destroy(struct Qdisc *sch) 1127 { 1128 struct dualpi2_sched_data *q = qdisc_priv(sch); 1129 1130 q->pi2_tupdate = 0; 1131 hrtimer_cancel(&q->pi2_timer); 1132 if (q->l_queue) 1133 qdisc_put(q->l_queue); 1134 tcf_block_put(q->tcf_block); 1135 } 1136 1137 static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg) 1138 { 1139 return NULL; 1140 } 1141 1142 static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid) 1143 { 1144 return 0; 1145 } 1146 1147 static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent, 1148 u32 classid) 1149 { 1150 return 0; 1151 } 1152 1153 static void dualpi2_unbind(struct Qdisc *q, unsigned long cl) 1154 { 1155 } 1156 1157 static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl, 1158 struct netlink_ext_ack *extack) 1159 { 1160 struct dualpi2_sched_data *q = qdisc_priv(sch); 1161 1162 if (cl) 1163 return NULL; 1164 return q->tcf_block; 1165 } 1166 1167 static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg) 1168 { 1169 unsigned int i; 1170 1171 if (arg->stop) 1172 return; 1173 1174 /* We statically define only 2 queues */ 1175 for (i = 0; i < 2; i++) { 1176 if (arg->count < arg->skip) { 1177 arg->count++; 1178 continue; 1179 } 1180 if (arg->fn(sch, i + 1, arg) < 0) { 1181 arg->stop = 1; 1182 break; 1183 } 1184 arg->count++; 1185 } 1186 } 1187 1188 /* Minimal class support to handle tc filters */ 1189 static const struct Qdisc_class_ops dualpi2_class_ops = { 1190 .leaf = dualpi2_leaf, 1191 .find = dualpi2_find, 1192 .tcf_block = dualpi2_tcf_block, 1193 .bind_tcf = dualpi2_bind, 1194 .unbind_tcf = dualpi2_unbind, 1195 .walk = dualpi2_walk, 1196 }; 1197 1198 static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { 1199 .id = "dualpi2", 1200 .cl_ops = &dualpi2_class_ops, 1201 .priv_size = sizeof(struct dualpi2_sched_data), 1202 .enqueue = dualpi2_qdisc_enqueue, 1203 .dequeue = dualpi2_qdisc_dequeue, 1204 .peek = dualpi2_peek, 1205 .init = dualpi2_init, 1206 .destroy = dualpi2_destroy, 1207 .reset = dualpi2_reset, 1208 .change = dualpi2_change, 1209 .dump = dualpi2_dump, 1210 .dump_stats = dualpi2_dump_stats, 1211 .owner = THIS_MODULE, 1212 }; 1213 MODULE_ALIAS_NET_SCH("dualpi2"); 1214 1215 static int __init dualpi2_module_init(void) 1216 { 1217 return register_qdisc(&dualpi2_qdisc_ops); 1218 } 1219 1220 static void __exit dualpi2_module_exit(void) 1221 { 1222 unregister_qdisc(&dualpi2_qdisc_ops); 1223 } 1224 1225 module_init(dualpi2_module_init); 1226 module_exit(dualpi2_module_exit); 1227 1228 MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler"); 1229 MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>"); 1230 MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>"); 1231 MODULE_AUTHOR("Olga Albisser <olga@albisser.org>"); 1232 MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>"); 1233 MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>"); 1234 1235 MODULE_LICENSE("Dual BSD/GPL"); 1236 MODULE_VERSION("1.0"); 1237