// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
 *
 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
 * Copyright (c) 2012 Paolo Valente.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>


/*  Quick Fair Queueing Plus
    ========================

    Sources:

    [1] Paolo Valente,
    "Reducing the Execution Time of Fair-Queueing Schedulers."
    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf

    Sources for QFQ:

    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
    Packet Scheduling with Tight Bandwidth Distribution Guarantees."

    See also:
    http://retis.sssup.it/~fabio/linux/qfq/
 */

/*

  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
  classes. Each aggregate is timestamped with a virtual start time S
  and a virtual finish time F, and scheduled according to its
  timestamps. S and F are computed as a function of a system virtual
  time function V. The classes within each aggregate are instead
  scheduled with DRR.

  To speed up operations, QFQ+ divides also aggregates into a limited
  number of groups. Which group a class belongs to depends on the
  ratio between the maximum packet length for the class and the weight
  of the class. Groups have their own S and F. In the end, QFQ+
  schedules groups, then aggregates within groups, then classes within
  aggregates. See [1] and [2] for a full description.

  Virtual time computations.

  S, F and V are all computed in fixed point arithmetic with
  FRAC_BITS decimal bits.

  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
	one bit per index.
  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.

  The layout of the bits is as below:
                   [ MTU_SHIFT ][      FRAC_BITS    ]
                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
				 ^.__grp->index = 0
				 *.__grp->slot_shift

  where MIN_SLOT_SHIFT is derived by difference from the others.

  The max group index corresponds to Lmax/w_min, where
  Lmax=1<<MTU_SHIFT, w_min = 1 .
  From this, and knowing how many groups (MAX_INDEX) we want,
  we can derive the shift corresponding to each group.

  Because we often need to compute
	F = S + len/w_i  and  V = V + len/wsum
  instead of storing w_i we store the value
	inv_w = (1<<FRAC_BITS)/w_i
  so we can do F = S + len * inv_w, and similarly V is advanced using
  the inverse of the weight sum (iwsum), which makes it easy to move
  between a static and an adaptive weight sum.

  The per-scheduler-instance data contain all the data structures
  for the scheduler: bitmaps and bucket lists.

 */

/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group.
 */
#define QFQ_MAX_SLOTS	32

/*
 * Shifts used for aggregate<->group mapping. We allow class weights that are
 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
 * group with the smallest index that can support the L_i / r_i configured
 * for the classes in the aggregate.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 */
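/*
 * A worked example of the arithmetic described above, using the constants
 * defined below (illustrative numbers only): with FRAC_BITS = 30 we have
 * ONE_FP = 2^30, so a class of weight w_i = 4 stores
 *
 *	inv_w = ONE_FP / 4 = 2^28 .
 *
 * Serving a 1024-byte packet then advances its finish time by
 * len * inv_w = 1024 * 2^28 = 2^38, i.e. 1024/4 = 256 time units once the
 * FRAC_BITS scaling is stripped. The group index of an aggregate is
 * roughly log_2(lmax * inv_w) - min_slot_shift, so doubling lmax or
 * halving the weight moves the aggregate one group up; see
 * qfq_calc_index() below.
 */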
#define QFQ_MAX_INDEX		24
#define QFQ_MAX_WSHIFT		10

#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)

#define FRAC_BITS		30	/* fixed point arithmetic */
#define ONE_FP			(1UL << FRAC_BITS)

#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
#define QFQ_MAX_LMAX		(1UL << QFQ_MTU_SHIFT)

#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */

/*
 * Possible group states.  These values are used as indexes for the bitmaps
 * array of struct qfq_sched.
 */
enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };

struct qfq_group;

struct qfq_aggregate;

struct qfq_class {
	struct Qdisc_class_common common;

	struct gnet_stats_basic_sync bstats;
	struct gnet_stats_queue qstats;
	struct net_rate_estimator __rcu *rate_est;
	struct Qdisc *qdisc;
	struct list_head alist;		/* Link for active-classes list. */
	struct qfq_aggregate *agg;	/* Parent aggregate. */
	int deficit;			/* DRR deficit counter. */
};

struct qfq_aggregate {
	struct hlist_node next;	/* Link for the slot list. */
	u64 S, F;		/* flow timestamps (exact) */

	/* group we belong to. In principle we would need the index,
	 * which is log_2(lmax/weight), but we never reference it
	 * directly, only the group.
	 */
	struct qfq_group *grp;

	/* these are copied from the flowset. */
	u32	class_weight; /* Weight of each class in this aggregate. */
	/* Max pkt size for the classes in this aggregate, DRR quantum. */
	int	lmax;

	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
	u32	budgetmax;  /* Max budget for this aggregate. */
	u32	initial_budget, budget;     /* Initial and current budget. */

	int		  num_classes;	/* Number of classes in this aggr. */
	struct list_head  active;	/* DRR queue of active classes. */

	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
};

struct qfq_group {
	u64 S, F;			/* group timestamps (approx). */
	unsigned int slot_shift;	/* Slot shift. */
	unsigned int index;		/* Group index. */
	unsigned int front;		/* Index of the front slot. */
	unsigned long full_slots;	/* non-empty slots */

	/* Array of RR lists of active aggregates. */
	struct hlist_head slots[QFQ_MAX_SLOTS];
};

struct qfq_sched {
	struct tcf_proto __rcu *filter_list;
	struct tcf_block	*block;
	struct Qdisc_class_hash clhash;

	u64			oldV, V;	/* Precise virtual times. */
	struct qfq_aggregate	*in_serv_agg;	/* Aggregate being served. */
	u32			wsum;		/* weight sum */
	u32			iwsum;		/* inverse weight sum */

	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */

	u32 max_agg_classes;		/* Max number of classes per aggr. */
	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
};
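/*
 * Illustrative note on the fields above: there are QFQ_MAX_INDEX + 1 = 25
 * groups, so one bit per group and per state fits in each word of
 * bitmaps[]. The virtual time V advances by len * iwsum on every dequeue,
 * with iwsum = ONE_FP / wsum: e.g. with a total weight sum of 20, a
 * 1000-byte packet advances V by 1000 * (ONE_FP / 20), i.e. about 50 time
 * units net of the fixed-point scaling.
 */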

/*
 * Possible reasons why the timestamps of an aggregate are updated
 * enqueue: the aggregate switches from idle to active and must be scheduled
 *	    for service
 * requeue: the aggregate finishes its budget, so it stops being served and
 *	    must be rescheduled for service
 */
enum update_reason {enqueue, requeue};

static bool cl_is_active(struct qfq_class *cl)
{
	return !list_empty(&cl->alist);
}

static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct qfq_class, common);
}

static const struct netlink_range_validation lmax_range = {
	.min = QFQ_MIN_LMAX,
	.max = QFQ_MAX_LMAX,
};

static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
	[TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, 1, QFQ_MAX_WEIGHT),
	[TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range),
};

/*
 * Calculate a flow index, given its weight and maximum packet length.
 * index = log_2(maxlen/weight) but we need to apply the scaling.
 * This is used only once at flow creation.
 */
static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
{
	u64 slot_size = (u64)maxlen * inv_w;
	unsigned long size_map;
	int index = 0;

	size_map = slot_size >> min_slot_shift;
	if (!size_map)
		goto out;

	index = __fls(size_map) + 1;	/* basically a log_2 */
	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));

	if (index < 0)
		index = 0;
out:
	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
		 (unsigned long) ONE_FP/inv_w, maxlen, index);

	return index;
}

static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
			     enum update_reason);

static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			 u32 lmax, u32 weight)
{
	INIT_LIST_HEAD(&agg->active);
	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);

	agg->lmax = lmax;
	agg->class_weight = weight;
}

static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
					  u32 lmax, u32 weight)
{
	struct qfq_aggregate *agg;

	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
		if (agg->lmax == lmax && agg->class_weight == weight)
			return agg;

	return NULL;
}
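/*
 * Example of the aggregate lookup above (illustrative): two classes
 * configured with weight 10 and lmax 1514 share the same aggregate, as
 * long as that aggregate still has room (fewer than max_agg_classes
 * members); a third class with weight 10 but lmax 9000 gets a different
 * aggregate, because both lmax and the class weight must match.
 */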

/* Update aggregate as a function of the new number of classes. */
static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			   int new_num_classes)
{
	u32 new_agg_weight;

	if (new_num_classes == q->max_agg_classes)
		hlist_del_init(&agg->nonfull_next);

	if (agg->num_classes > new_num_classes &&
	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);

	/* The next assignment may let
	 * agg->initial_budget > agg->budgetmax
	 * hold, we will take it into account in charge_actual_service().
	 */
	agg->budgetmax = new_num_classes * agg->lmax;
	new_agg_weight = agg->class_weight * new_num_classes;
	agg->inv_w = ONE_FP/new_agg_weight;

	if (agg->grp == NULL) {
		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
				       q->min_slot_shift);
		agg->grp = &q->groups[i];
	}

	q->wsum +=
		(int) agg->class_weight * (new_num_classes - agg->num_classes);
	q->iwsum = ONE_FP / q->wsum;

	agg->num_classes = new_num_classes;
}

/* Add class to aggregate. */
static void qfq_add_to_agg(struct qfq_sched *q,
			   struct qfq_aggregate *agg,
			   struct qfq_class *cl)
{
	cl->agg = agg;

	qfq_update_agg(q, agg, agg->num_classes+1);
	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
		list_add_tail(&cl->alist, &agg->active);
		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
		    cl && q->in_serv_agg != agg) /* agg was inactive */
			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
	}
}

static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);

static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{
	hlist_del_init(&agg->nonfull_next);
	q->wsum -= agg->class_weight;
	if (q->wsum != 0)
		q->iwsum = ONE_FP / q->wsum;

	if (q->in_serv_agg == agg)
		q->in_serv_agg = qfq_choose_next_agg(q);
	kfree(agg);
}

/* Deschedule class from within its parent aggregate. */
static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
{
	struct qfq_aggregate *agg = cl->agg;

	list_del_init(&cl->alist); /* remove from RR queue of the aggregate */
	if (list_empty(&agg->active)) /* agg is now inactive */
		qfq_deactivate_agg(q, agg);
}

/* Remove class from its parent aggregate. */
static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
{
	struct qfq_aggregate *agg = cl->agg;

	cl->agg = NULL;
	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
		qfq_destroy_agg(q, agg);
		return;
	}
	qfq_update_agg(q, agg, agg->num_classes-1);
}
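/*
 * Worked example for qfq_update_agg() above (illustrative): an aggregate
 * of classes with class_weight = 2 and lmax = 1514 that grows from 2 to 3
 * classes gets budgetmax = 3 * 1514 = 4542 bytes and
 * inv_w = ONE_FP / (2 * 3), while q->wsum grows by 2 and q->iwsum is
 * recomputed accordingly.
 */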

/* Deschedule class and remove it from its parent aggregate. */
static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
{
	if (cl->qdisc->q.qlen > 0) /* class is active */
		qfq_deactivate_class(q, cl);

	qfq_rm_from_agg(q, cl);
}

/* Move class to a new aggregate, matching the new class weight and/or lmax */
static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
			   u32 lmax)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_aggregate *new_agg;

	/* 'lmax' can range from [QFQ_MIN_LMAX, pktlen + stab overhead] */
	if (lmax > QFQ_MAX_LMAX)
		return -EINVAL;

	new_agg = qfq_find_agg(q, lmax, weight);
	if (new_agg == NULL) { /* create new aggregate */
		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
		if (new_agg == NULL)
			return -ENOBUFS;
		qfq_init_agg(q, new_agg, lmax, weight);
	}
	qfq_deact_rm_from_agg(q, cl);
	qfq_add_to_agg(q, new_agg, cl);

	return 0;
}

static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg,
			    struct netlink_ext_ack *extack)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl = (struct qfq_class *)*arg;
	bool existing = false;
	struct nlattr *tb[TCA_QFQ_MAX + 1];
	struct qfq_aggregate *new_agg = NULL;
	u32 weight, lmax, inv_w, old_weight, old_lmax;
	int err;
	int delta_w;

	if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) {
		NL_SET_ERR_MSG_MOD(extack, "missing options");
		return -EINVAL;
	}

	err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS],
					  qfq_policy, extack);
	if (err < 0)
		return err;

	weight = nla_get_u32_default(tb[TCA_QFQ_WEIGHT], 1);

	if (tb[TCA_QFQ_LMAX]) {
		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
	} else {
		/* MTU size is user controlled */
		lmax = psched_mtu(qdisc_dev(sch));
		if (lmax < QFQ_MIN_LMAX || lmax > QFQ_MAX_LMAX) {
			NL_SET_ERR_MSG_MOD(extack,
					   "MTU size out of bounds for qfq");
			return -EINVAL;
		}
	}

	inv_w = ONE_FP / weight;
	weight = ONE_FP / inv_w;

	if (cl != NULL) {
		sch_tree_lock(sch);
		old_weight = cl->agg->class_weight;
		old_lmax   = cl->agg->lmax;
		sch_tree_unlock(sch);
		if (lmax == old_lmax && weight == old_weight)
			return 0; /* nothing to change */
	}

	delta_w = weight - (cl ? old_weight : 0);

	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
		NL_SET_ERR_MSG_FMT_MOD(extack,
				       "total weight out of range (%d + %u)",
				       delta_w, q->wsum);
		return -EINVAL;
	}
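	/*
	 * Worked bound for the check above (illustrative): QFQ_MAX_WSUM is
	 * 64 * QFQ_MAX_WEIGHT = 65536, so for instance at most 64 classes of
	 * the maximum weight 1024, or 65536 classes of weight 1, can coexist.
	 */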

	if (cl != NULL) { /* modify existing class */
		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    NULL,
						    true,
						    tca[TCA_RATE]);
			if (err)
				return err;
		}
		existing = true;
		goto set_change_agg;
	}

	/* create and init new class */
	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
	if (cl == NULL)
		return -ENOBUFS;

	gnet_stats_basic_sync_init(&cl->bstats);
	cl->common.classid = classid;
	cl->deficit = lmax;
	INIT_LIST_HEAD(&cl->alist);

	cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      classid, NULL);
	if (cl->qdisc == NULL)
		cl->qdisc = &noop_qdisc;

	if (tca[TCA_RATE]) {
		err = gen_new_estimator(&cl->bstats, NULL,
					&cl->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err)
			goto destroy_class;
	}

	if (cl->qdisc != &noop_qdisc)
		qdisc_hash_add(cl->qdisc, true);

set_change_agg:
	sch_tree_lock(sch);
	new_agg = qfq_find_agg(q, lmax, weight);
	if (new_agg == NULL) { /* create new aggregate */
		sch_tree_unlock(sch);
		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
		if (new_agg == NULL) {
			err = -ENOBUFS;
			gen_kill_estimator(&cl->rate_est);
			goto destroy_class;
		}
		sch_tree_lock(sch);
		qfq_init_agg(q, new_agg, lmax, weight);
	}
	if (existing)
		qfq_deact_rm_from_agg(q, cl);
	else
		qdisc_class_hash_insert(&q->clhash, &cl->common);
	qfq_add_to_agg(q, new_agg, cl);
	sch_tree_unlock(sch);
	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

destroy_class:
	qdisc_put(cl->qdisc);
	kfree(cl);
	return err;
}

static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
{
	struct qfq_sched *q = qdisc_priv(sch);

	qfq_rm_from_agg(q, cl);
	gen_kill_estimator(&cl->rate_est);
	qdisc_put(cl->qdisc);
	kfree(cl);
}

static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
			    struct netlink_ext_ack *extack)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl = (struct qfq_class *)arg;

	if (qdisc_class_in_use(&cl->common)) {
		NL_SET_ERR_MSG_MOD(extack, "QFQ class in use");
		return -EBUSY;
	}

	sch_tree_lock(sch);

	qdisc_purge_queue(cl->qdisc);
	qdisc_class_hash_remove(&q->clhash, &cl->common);
	qfq_destroy_class(sch, cl);

	sch_tree_unlock(sch);

	return 0;
}

static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
{
	return (unsigned long)qfq_find_class(sch, classid);
}

static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct qfq_sched *q = qdisc_priv(sch);

	if (cl)
		return NULL;

	return q->block;
}

static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
				  u32 classid)
{
	struct qfq_class *cl = qfq_find_class(sch, classid);

	if (cl)
		qdisc_class_get(&cl->common);

	return (unsigned long)cl;
}

static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct qfq_class *cl = (struct qfq_class *)arg;

	qdisc_class_put(&cl->common);
}

static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
			   struct Qdisc *new, struct Qdisc **old,
			   struct netlink_ext_ack *extack)
{
	struct qfq_class *cl = (struct qfq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
					cl->common.classid, NULL);
		if (new == NULL)
			new = &noop_qdisc;
	}

	*old = qdisc_replace(sch, new, &cl->qdisc);
	return 0;
}

static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct qfq_class *cl = (struct qfq_class *)arg;

	return cl->qdisc;
}

static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct qfq_class *cl = (struct qfq_class *)arg;
	struct nlattr *nest;
	u32 class_weight, lmax;

	tcm->tcm_parent	= TC_H_ROOT;
	tcm->tcm_handle	= cl->common.classid;
	tcm->tcm_info	= cl->qdisc->handle;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	sch_tree_lock(sch);
	class_weight	= cl->agg->class_weight;
	lmax		= cl->agg->lmax;
	sch_tree_unlock(sch);
	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, class_weight) ||
	    nla_put_u32(skb, TCA_QFQ_LMAX, lmax))
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
				struct gnet_dump *d)
{
	struct qfq_class *cl = (struct qfq_class *)arg;
	struct tc_qfq_stats xstats;

	memset(&xstats, 0, sizeof(xstats));

	sch_tree_lock(sch);
	xstats.weight = cl->agg->class_weight;
	xstats.lmax = cl->agg->lmax;
	sch_tree_unlock(sch);

	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    qdisc_qstats_copy(d, cl->qdisc) < 0)
		return -1;

	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl;
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
				return;
		}
	}
}

static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl;
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
		pr_debug("qfq_classify: found %d\n", skb->priority);
		cl = qfq_find_class(sch, skb->priority);
		if (cl != NULL)
			return cl;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	fl = rcu_dereference_bh(q->filter_list);
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return NULL;
		}
#endif
		cl = (struct qfq_class *)res.class;
		if (cl == NULL)
			cl = qfq_find_class(sch, res.classid);
		return cl;
	}

	return NULL;
}
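/*
 * Example of the fast path in qfq_classify() above (illustrative): if the
 * qdisc handle is 1: and a socket sets skb->priority to 0x1000a (classid
 * 1:a), the major numbers match and the class is looked up directly,
 * without consulting the tc filters attached to the qdisc.
 */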

/* Generic comparison function, handling wraparound. */
static inline int qfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}

/* Round a precise timestamp to its slotted value. */
static inline u64 qfq_round_down(u64 ts, unsigned int shift)
{
	return ts & ~((1ULL << shift) - 1);
}

/* return the pointer to the group with lowest index in the bitmap */
static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
					unsigned long bitmap)
{
	int index = __ffs(bitmap);
	return &q->groups[index];
}

/* Calculate a mask to mimic what would be ffs_from(). */
static inline unsigned long mask_from(unsigned long bitmap, int from)
{
	return bitmap & ~((1UL << from) - 1);
}

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB
 */
static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
{
	/* if S > V we are not eligible */
	unsigned int state = qfq_gt(grp->S, q->V);
	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (qfq_gt(grp->F, next->F))
			state |= EB;
	}

	return state;
}


/*
 * In principle
 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 *	q->bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
				   int src, int dst)
{
	q->bitmaps[dst] |= q->bitmaps[src] & mask;
	q->bitmaps[src] &= ~mask;
}

static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
{
	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (!qfq_gt(next->F, old_F))
			return;
	}

	mask = (1UL << index) - 1;
	qfq_move_groups(q, mask, EB, ER);
	qfq_move_groups(q, mask, IB, IR);
}

/*
 * perhaps
 *
	old_V ^= q->V;
	old_V >>= q->min_slot_shift;
	if (old_V) {
		...
	}
 *
 */
static void qfq_make_eligible(struct qfq_sched *q)
{
	unsigned long vslot = q->V >> q->min_slot_shift;
	unsigned long old_vslot = q->oldV >> q->min_slot_shift;

	if (vslot != old_vslot) {
		unsigned long mask;
		int last_flip_pos = fls(vslot ^ old_vslot);

		if (last_flip_pos > 31) /* higher than the number of groups */
			mask = ~0UL;    /* make all groups eligible */
		else
			mask = (1UL << last_flip_pos) - 1;

		qfq_move_groups(q, mask, IR, ER);
		qfq_move_groups(q, mask, IB, EB);
	}
}
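/*
 * Worked example for qfq_make_eligible() above (illustrative, assuming
 * min_slot_shift = 25): if V crosses a multiple of 2^26 but not of 2^27,
 * the two low bits of vslot change e.g. from 01 to 10, so
 * fls(vslot ^ old_vslot) = 2 and mask = 0x3: groups 0 and 1 are moved
 * from IR to ER and from IB to EB.
 */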

/*
 * The index of the slot in which the input aggregate agg is to be
 * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
 * and not a '-1' because the start time of the group may be moved
 * backward by one slot after the aggregate has been inserted, and
 * this would cause non-empty slots to be right-shifted by one
 * position.
 *
 * QFQ+ fully satisfies this bound to the slot index if the parameters
 * of the classes are not changed dynamically, and if QFQ+ never
 * happens to postpone the service of agg unjustly, i.e., it never
 * happens that the aggregate becomes backlogged and eligible, or just
 * eligible, while an aggregate with a higher approximated finish time
 * is being served. In particular, in this case QFQ+ guarantees that
 * the timestamps of agg are low enough that the slot index is never
 * higher than 2. Unfortunately, QFQ+ cannot provide the same
 * guarantee if it happens to unjustly postpone the service of agg, or
 * if the parameters of some class are changed.
 *
 * As for the first event, i.e., an out-of-order service, the
 * upper bound to the slot index guaranteed by QFQ+ grows to
 * 2 +
 * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
 * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
 *
 * The following function deals with this problem by backward-shifting
 * the timestamps of agg, if needed, so as to guarantee that the slot
 * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
 * cause the service of other aggregates to be postponed, yet the
 * worst-case guarantees of these aggregates are not violated. In
 * fact, in case of no out-of-order service, the timestamps of agg
 * would have been even lower than they are after the backward shift,
 * because QFQ+ would have guaranteed a maximum value equal to 2 for
 * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
 * service is postponed because of the backward-shift would have
 * however waited for the service of agg before being served.
 *
 * The other event that may cause the slot index to be higher than 2
 * for agg is a recent change of the parameters of some class. If the
 * weight of a class is increased or the lmax (max_pkt_size) of the
 * class is decreased, then a new aggregate with smaller slot size
 * than the original parent aggregate of the class may happen to be
 * activated. The activation of this aggregate should be properly
 * delayed to when the service of the class has finished in the ideal
 * system tracked by QFQ+. If the activation of the aggregate is not
 * delayed to this reference time instant, then this aggregate may be
 * unjustly served before other aggregates waiting for service. This
 * may cause the above bound to the slot index to be violated for some
 * of these unlucky aggregates.
 *
 * Instead of delaying the activation of the new aggregate, which is
 * quite complex, the above-discussed capping of the slot index is
 * used to handle also the consequences of a change of the parameters
 * of a class.
 */
static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
			    u64 roundedS)
{
	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
	unsigned int i; /* slot index in the bucket list */

	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
		u64 deltaS = roundedS - grp->S -
			     ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
		agg->S -= deltaS;
		agg->F -= deltaS;
		slot = QFQ_MAX_SLOTS - 2;
	}

	i = (grp->front + slot) % QFQ_MAX_SLOTS;

	hlist_add_head(&agg->next, &grp->slots[i]);
	__set_bit(slot, &grp->full_slots);
}
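/*
 * Worked example for qfq_slot_insert() above (illustrative): with
 * grp->slot_shift = 30, grp->S = 0 and roundedS = 3ULL << 30, the
 * aggregate lands in logical slot 3, i.e. bucket
 * (grp->front + 3) % QFQ_MAX_SLOTS, and bit 3 of full_slots is set.
 */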

/* Maybe introduce hlist_first_entry?? */
static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
{
	return hlist_entry(grp->slots[grp->front].first,
			   struct qfq_aggregate, next);
}

/*
 * remove the entry from the slot
 */
static void qfq_front_slot_remove(struct qfq_group *grp)
{
	struct qfq_aggregate *agg = qfq_slot_head(grp);

	BUG_ON(!agg);
	hlist_del(&agg->next);
	if (hlist_empty(&grp->slots[grp->front]))
		__clear_bit(0, &grp->full_slots);
}

/*
 * Returns the first aggregate in the first non-empty bucket of the
 * group. As a side effect, adjusts the bucket list so the first
 * non-empty bucket is at position 0 in full_slots.
 */
static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
{
	unsigned int i;

	pr_debug("qfq slot_scan: grp %u full %#lx\n",
		 grp->index, grp->full_slots);

	if (grp->full_slots == 0)
		return NULL;

	i = __ffs(grp->full_slots);  /* zero based */
	if (i > 0) {
		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
		grp->full_slots >>= i;
	}

	return qfq_slot_head(grp);
}

/*
 * adjust the bucket list. When the start time of a group decreases,
 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 * move the objects. The mask of occupied slots must be shifted
 * because we use ffs() to find the first non-empty slot.
 * This covers decreases in the group's start time, but what about
 * increases of the start time ?
 * Here too we should make sure that i is less than 32
 */
static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
{
	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;

	grp->full_slots <<= i;
	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
}

static void qfq_update_eligible(struct qfq_sched *q)
{
	struct qfq_group *grp;
	unsigned long ineligible;

	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
	if (ineligible) {
		if (!q->bitmaps[ER]) {
			grp = qfq_ffs(q, ineligible);
			if (qfq_gt(grp->S, q->V))
				q->V = grp->S;
		}
		qfq_make_eligible(q);
	}
}

/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
				   struct qfq_class *cl, unsigned int len)
{
	struct sk_buff *skb = qdisc_dequeue_peeked(cl->qdisc);

	if (!skb)
		return NULL;

	cl->deficit -= (int) len;

	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
		list_del_init(&cl->alist);
	else if (cl->deficit < qdisc_peek_len(cl->qdisc)) {
		cl->deficit += agg->lmax;
		list_move_tail(&cl->alist, &agg->active);
	}

	return skb;
}

static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
					   struct qfq_class **cl,
					   unsigned int *len)
{
	struct sk_buff *skb;

	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
	if (skb == NULL)
		qdisc_warn_nonwc("qfq_dequeue", (*cl)->qdisc);
	else
		*len = qdisc_pkt_len(skb);

	return skb;
}
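/*
 * DRR example for agg_dequeue() above (illustrative): a class with
 * deficit 1514 that sends a 1000-byte packet is left with deficit 514;
 * if its next packet is 1500 bytes long, the remaining deficit is not
 * enough, so the class receives another lmax of deficit and is moved to
 * the tail of the aggregate's active list.
 */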

/* Update F according to the actual service received by the aggregate. */
static inline void charge_actual_service(struct qfq_aggregate *agg)
{
	/* Compute the service received by the aggregate, taking into
	 * account that, after decreasing the number of classes in
	 * agg, it may happen that
	 * agg->initial_budget - agg->budget > agg->budgetmax
	 */
	u32 service_received = min(agg->budgetmax,
				   agg->initial_budget - agg->budget);

	agg->F = agg->S + (u64)service_received * agg->inv_w;
}

/* Assign a reasonable start time for a new aggregate in group i.
 * Admissible values for \hat(F) are multiples of \sigma_i
 * no greater than V+\sigma_i . Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in EB (see [2]). So, if we have groups in ER,
 * set S to the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
 */
static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
{
	unsigned long mask;
	u64 limit, roundedF;
	int slot_shift = agg->grp->slot_shift;

	roundedF = qfq_round_down(agg->F, slot_shift);
	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);

	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
		/* timestamp was stale */
		mask = mask_from(q->bitmaps[ER], agg->grp->index);
		if (mask) {
			struct qfq_group *next = qfq_ffs(q, mask);
			if (qfq_gt(roundedF, next->F)) {
				if (qfq_gt(limit, next->F))
					agg->S = next->F;
				else /* preserve timestamp correctness */
					agg->S = limit;
				return;
			}
		}
		agg->S = q->V;
	} else  /* timestamp is not stale */
		agg->S = agg->F;
}

/* Update the timestamps of agg before scheduling/rescheduling it for
 * service.  In particular, assign to agg->F its maximum possible
 * value, i.e., the virtual finish time with which the aggregate
 * should be labeled if it used all its budget once in service.
 */
static inline void
qfq_update_agg_ts(struct qfq_sched *q,
		    struct qfq_aggregate *agg, enum update_reason reason)
{
	if (reason != requeue)
		qfq_update_start(q, agg);
	else /* just charge agg for the service received */
		agg->S = agg->F;

	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
}
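/*
 * Timestamp example for the helpers above (illustrative): an aggregate
 * with budgetmax = 3028 and inv_w = ONE_FP/4 scheduled at S = V gets
 * F = S + 3028 * ONE_FP/4, i.e. its finish time assumes the whole budget
 * will be used; if it is descheduled after consuming only 1514 bytes,
 * charge_actual_service() pulls F back to S + 1514 * ONE_FP/4.
 */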

static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);

static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
	struct qfq_class *cl;
	struct sk_buff *skb = NULL;
	/* next-packet len, 0 means no more active classes in in-service agg */
	unsigned int len = 0;

	if (in_serv_agg == NULL)
		return NULL;

	if (!list_empty(&in_serv_agg->active))
		skb = qfq_peek_skb(in_serv_agg, &cl, &len);

	/*
	 * If there are no active classes in the in-service aggregate,
	 * or if the aggregate has not enough budget to serve its next
	 * class, then choose the next aggregate to serve.
	 */
	if (len == 0 || in_serv_agg->budget < len) {
		charge_actual_service(in_serv_agg);

		/* recharge the budget of the aggregate */
		in_serv_agg->initial_budget = in_serv_agg->budget =
			in_serv_agg->budgetmax;

		if (!list_empty(&in_serv_agg->active)) {
			/*
			 * Still active: reschedule for
			 * service. Possible optimization: if no other
			 * aggregate is active, then there is no point
			 * in rescheduling this aggregate, and we can
			 * just keep it as the in-service one. This
			 * should be however a corner case, and to
			 * handle it, we would need to maintain an
			 * extra num_active_aggs field.
			*/
			qfq_update_agg_ts(q, in_serv_agg, requeue);
			qfq_schedule_agg(q, in_serv_agg);
		} else if (sch->q.qlen == 0) { /* no aggregate to serve */
			q->in_serv_agg = NULL;
			return NULL;
		}

		/*
		 * If we get here, there are other aggregates queued:
		 * choose the new aggregate to serve.
		 */
		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
	}
	if (!skb)
		return NULL;

	sch->q.qlen--;

	skb = agg_dequeue(in_serv_agg, cl, len);

	if (!skb) {
		sch->q.qlen++;
		return NULL;
	}

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	/* If lmax is lowered, through qfq_change_class, for a class
	 * owning pending packets with larger size than the new value
	 * of lmax, then the following condition may hold.
	 */
	if (unlikely(in_serv_agg->budget < len))
		in_serv_agg->budget = 0;
	else
		in_serv_agg->budget -= len;

	q->V += (u64)len * q->iwsum;
	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
		 len, (unsigned long long) in_serv_agg->F,
		 (unsigned long long) q->V);

	return skb;
}

static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
{
	struct qfq_group *grp;
	struct qfq_aggregate *agg, *new_front_agg;
	u64 old_F;

	qfq_update_eligible(q);
	q->oldV = q->V;

	if (!q->bitmaps[ER])
		return NULL;

	grp = qfq_ffs(q, q->bitmaps[ER]);
	old_F = grp->F;

	agg = qfq_slot_head(grp);

	/* agg starts to be served, remove it from schedule */
	qfq_front_slot_remove(grp);

	new_front_agg = qfq_slot_scan(grp);

	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
		__clear_bit(grp->index, &q->bitmaps[ER]);
	else {
		u64 roundedS = qfq_round_down(new_front_agg->S,
					      grp->slot_shift);
		unsigned int s;

		if (grp->S == roundedS)
			return agg;
		grp->S = roundedS;
		grp->F = roundedS + (2ULL << grp->slot_shift);
		__clear_bit(grp->index, &q->bitmaps[ER]);
		s = qfq_calc_state(q, grp);
		__set_bit(grp->index, &q->bitmaps[s]);
	}

	qfq_unblock_groups(q, grp->index, old_F);

	return agg;
}
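/*
 * Example for qfq_choose_next_agg() above (illustrative): if groups 2 and
 * 5 are in ER, the aggregate at the head of group 2's front slot is
 * served next; if group 2's front slot then becomes empty and no other
 * slot of the group is occupied, the group's bit is simply cleared from
 * ER, otherwise the group is re-timestamped from its new front aggregate.
 */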

static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	unsigned int len = qdisc_pkt_len(skb), gso_segs;
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl;
	struct qfq_aggregate *agg;
	int err = 0;

	cl = qfq_classify(skb, sch, &err);
	if (cl == NULL) {
		if (err & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return err;
	}
	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);

	if (unlikely(cl->agg->lmax < len)) {
		pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
			 cl->agg->lmax, len, cl->common.classid);
		err = qfq_change_agg(sch, cl, cl->agg->class_weight, len);
		if (err) {
			cl->qstats.drops++;
			return qdisc_drop(skb, sch, to_free);
		}
	}

	gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
	err = qdisc_enqueue(skb, cl->qdisc, to_free);
	if (unlikely(err != NET_XMIT_SUCCESS)) {
		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
		if (net_xmit_drop_count(err)) {
			cl->qstats.drops++;
			qdisc_qstats_drop(sch);
		}
		return err;
	}

	_bstats_update(&cl->bstats, len, gso_segs);
	sch->qstats.backlog += len;
	++sch->q.qlen;

	agg = cl->agg;
	/* if the class is active, then done here */
	if (cl_is_active(cl)) {
		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
		    list_first_entry(&agg->active, struct qfq_class, alist)
		    == cl && cl->deficit < len)
			list_move_tail(&cl->alist, &agg->active);

		return err;
	}

	/* schedule class for service within the aggregate */
	cl->deficit = agg->lmax;
	list_add_tail(&cl->alist, &agg->active);

	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
	    q->in_serv_agg == agg)
		return err; /* non-empty or in service, nothing else to do */

	qfq_activate_agg(q, agg, enqueue);

	return err;
}

/*
 * Schedule aggregate according to its timestamps.
 */
static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{
	struct qfq_group *grp = agg->grp;
	u64 roundedS;
	int s;

	roundedS = qfq_round_down(agg->S, grp->slot_shift);

	/*
	 * Insert agg in the correct bucket.
	 * If agg->S >= grp->S we don't need to adjust the
	 * bucket list and simply go to the insertion phase.
	 * Otherwise grp->S is decreasing, we must make room
	 * in the bucket list, and also recompute the group state.
	 * Finally, if there were no flows in this group and nobody
	 * was in ER make sure to adjust V.
	 */
	if (grp->full_slots) {
		if (!qfq_gt(grp->S, agg->S))
			goto skip_update;

		/* create a slot for this agg->S */
		qfq_slot_rotate(grp, roundedS);
		/* group was surely ineligible, remove */
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[IB]);
	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
		   q->in_serv_agg == NULL)
		q->V = roundedS;

	grp->S = roundedS;
	grp->F = roundedS + (2ULL << grp->slot_shift);
	s = qfq_calc_state(q, grp);
	__set_bit(grp->index, &q->bitmaps[s]);

	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
		 s, q->bitmaps[s],
		 (unsigned long long) agg->S,
		 (unsigned long long) agg->F,
		 (unsigned long long) q->V);

skip_update:
	qfq_slot_insert(grp, agg, roundedS);
}
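/*
 * Note on the group timestamps set above (illustrative): a group's
 * approximated finish time is always two slots past its rounded start
 * time, e.g. with slot_shift = 30 and roundedS = 5ULL << 30 the group
 * gets S = 5ULL << 30 and F = 7ULL << 30, which is what qfq_calc_state()
 * compares against the other groups in ER.
 */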


/* Update agg ts and schedule agg for service */
static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			     enum update_reason reason)
{
	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */

	qfq_update_agg_ts(q, agg, reason);
	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
		q->in_serv_agg = agg; /* start serving this aggregate */
		 /* update V: to be in service, agg must be eligible */
		q->oldV = q->V = agg->S;
	} else if (agg != q->in_serv_agg)
		qfq_schedule_agg(q, agg);
}

static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
			    struct qfq_aggregate *agg)
{
	unsigned int i, offset;
	u64 roundedS;

	roundedS = qfq_round_down(agg->S, grp->slot_shift);
	offset = (roundedS - grp->S) >> grp->slot_shift;

	i = (grp->front + offset) % QFQ_MAX_SLOTS;

	hlist_del(&agg->next);
	if (hlist_empty(&grp->slots[i]))
		__clear_bit(offset, &grp->full_slots);
}

/*
 * Called to forcibly deschedule an aggregate.  If the aggregate is
 * not in the front bucket, or if the latter has other aggregates in
 * the front bucket, we can simply remove the aggregate with no other
 * side effects.
 * Otherwise we must propagate the event up.
 */
static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{
	struct qfq_group *grp = agg->grp;
	unsigned long mask;
	u64 roundedS;
	int s;

	if (agg == q->in_serv_agg) {
		charge_actual_service(agg);
		q->in_serv_agg = qfq_choose_next_agg(q);
		return;
	}

	agg->F = agg->S;
	qfq_slot_remove(q, grp, agg);

	if (!grp->full_slots) {
		__clear_bit(grp->index, &q->bitmaps[IR]);
		__clear_bit(grp->index, &q->bitmaps[EB]);
		__clear_bit(grp->index, &q->bitmaps[IB]);

		if (test_bit(grp->index, &q->bitmaps[ER]) &&
		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
			if (mask)
				mask = ~((1UL << __fls(mask)) - 1);
			else
				mask = ~0UL;
			qfq_move_groups(q, mask, EB, ER);
			qfq_move_groups(q, mask, IB, IR);
		}
		__clear_bit(grp->index, &q->bitmaps[ER]);
	} else if (hlist_empty(&grp->slots[grp->front])) {
		agg = qfq_slot_scan(grp);
		roundedS = qfq_round_down(agg->S, grp->slot_shift);
		if (grp->S != roundedS) {
			__clear_bit(grp->index, &q->bitmaps[ER]);
			__clear_bit(grp->index, &q->bitmaps[IR]);
			__clear_bit(grp->index, &q->bitmaps[EB]);
			__clear_bit(grp->index, &q->bitmaps[IB]);
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
	}
}

static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl = (struct qfq_class *)arg;

	if (list_empty(&cl->alist))
		return;
	qfq_deactivate_class(q, cl);
}

static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
			  struct netlink_ext_ack *extack)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_group *grp;
	int i, j, err;
	u32 max_cl_shift, maxbudg_shift, max_classes;

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		return err;

	max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1,
			    QFQ_MAX_AGG_CLASSES);
	/* max_cl_shift = floor(log_2(max_classes)) */
	max_cl_shift = __fls(max_classes);
	q->max_agg_classes = 1<<max_cl_shift;
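	/*
	 * Worked example for the computation below (illustrative): with a
	 * large enough tx_queue_len (e.g. the default 1000), max_classes = 8
	 * and max_cl_shift = 3, so maxbudg_shift = 16 + 3 = 19 and
	 * min_slot_shift = 30 + 19 - 24 = 25; group i then uses
	 * slot_shift = 25 + i.
	 */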

	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;

	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
		grp = &q->groups[i];
		grp->index = i;
		grp->slot_shift = q->min_slot_shift + i;
		for (j = 0; j < QFQ_MAX_SLOTS; j++)
			INIT_HLIST_HEAD(&grp->slots[j]);
	}

	INIT_HLIST_HEAD(&q->nonfull_aggs);

	return 0;
}

static void qfq_reset_qdisc(struct Qdisc *sch)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl;
	unsigned int i;

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
			if (cl->qdisc->q.qlen > 0)
				qfq_deactivate_class(q, cl);

			qdisc_reset(cl->qdisc);
		}
	}
}

static void qfq_destroy_qdisc(struct Qdisc *sch)
{
	struct qfq_sched *q = qdisc_priv(sch);
	struct qfq_class *cl;
	struct hlist_node *next;
	unsigned int i;

	tcf_block_put(q->block);

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
					  common.hnode) {
			qfq_destroy_class(sch, cl);
		}
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static const struct Qdisc_class_ops qfq_class_ops = {
	.change		= qfq_change_class,
	.delete		= qfq_delete_class,
	.find		= qfq_search_class,
	.tcf_block	= qfq_tcf_block,
	.bind_tcf	= qfq_bind_tcf,
	.unbind_tcf	= qfq_unbind_tcf,
	.graft		= qfq_graft_class,
	.leaf		= qfq_class_leaf,
	.qlen_notify	= qfq_qlen_notify,
	.dump		= qfq_dump_class,
	.dump_stats	= qfq_dump_class_stats,
	.walk		= qfq_walk,
};

static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
	.cl_ops		= &qfq_class_ops,
	.id		= "qfq",
	.priv_size	= sizeof(struct qfq_sched),
	.enqueue	= qfq_enqueue,
	.dequeue	= qfq_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= qfq_init_qdisc,
	.reset		= qfq_reset_qdisc,
	.destroy	= qfq_destroy_qdisc,
	.owner		= THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("qfq");

static int __init qfq_init(void)
{
	return register_qdisc(&qfq_qdisc_ops);
}

static void __exit qfq_exit(void)
{
	unregister_qdisc(&qfq_qdisc_ops);
}

module_init(qfq_init);
module_exit(qfq_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Quick Fair Queueing Plus qdisc");