/*
 * Fair Queue CoDel discipline
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/codel.h>

/* Fair Queue CoDel.
 *
 * Principles:
 * Packets are classified (by an internal or external classifier) into flows.
 * This is a stochastic model (as we use a hash, several flows
 * might be hashed to the same slot).
 * Each flow has a CoDel managed queue.
 * Flows are linked onto two (Round Robin) lists,
 * so that new flows have priority over old ones.
 *
 * For a given flow, packets are not reordered (CoDel uses a FIFO);
 * head drops only.
 * ECN capability is on by default.
 * Low memory footprint (64 bytes per flow).
 */

struct fq_codel_flow {
	struct sk_buff	  *head;
	struct sk_buff	  *tail;
	struct list_head  flowchain;
	int		  deficit;
	u32		  dropped; /* number of drops (or ECN marks) on this flow */
	struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */

struct fq_codel_sched_data {
	struct tcf_proto *filter_list;	/* optional external classifier */
	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
	u32		*backlogs;	/* backlog table [flows_cnt] */
	u32		flows_cnt;	/* number of flows */
	u32		perturbation;	/* hash perturbation */
	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
	struct codel_params cparams;
	struct codel_stats cstats;
	u32		drop_overlimit;
	u32		new_flow_count;

	struct list_head new_flows;	/* list of new flows */
	struct list_head old_flows;	/* list of old flows */
};

static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
				  const struct sk_buff *skb)
{
	struct flow_keys keys;
	unsigned int hash;

	skb_flow_dissect(skb, &keys);
	hash = jhash_3words((__force u32)keys.dst,
			    (__force u32)keys.src ^ keys.ip_proto,
			    (__force u32)keys.ports, q->perturbation);
	return ((u64)hash * q->flows_cnt) >> 32;
}
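/* Note (illustrative, not part of the original source): the multiply-and-shift
 * above maps the 32-bit jhash value to a bucket index in [0, flows_cnt)
 * without a division: treat hash / 2^32 as a fraction in [0, 1) and scale it
 * by flows_cnt. For example, with the default flows_cnt of 1024, a hash of
 * 0x80000000 selects bucket ((u64)0x80000000 * 1024) >> 32 == 512.
 * A minimal restatement of the mapping, assuming only a hash value and a
 * bucket count:
 *
 *	static inline u32 bucket_of(u32 hash, u32 nbuckets)
 *	{
 *		return ((u64)hash * nbuckets) >> 32;
 *	}
 */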
static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->flows_cnt)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return fq_codel_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->flows_cnt)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/* helper functions: might be changed when/if skb uses a standard list_head */

/* remove one skb from head of slot queue */
static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
{
	struct sk_buff *skb = flow->head;

	flow->head = skb->next;
	skb->next = NULL;
	return skb;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_codel_flow *flow,
				  struct sk_buff *skb)
{
	if (flow->head == NULL)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

static unsigned int fq_codel_drop(struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int maxbacklog = 0, idx = 0, i, len;
	struct fq_codel_flow *flow;

	/* Queue is full! Find the fat flow and drop a packet from it.
	 * This might sound expensive, but with 1024 flows, we scan
	 * 4KB of memory, and we don't need to handle a complex tree
	 * in the fast path (packet queue/enqueue) with many cache misses.
	 */
	for (i = 0; i < q->flows_cnt; i++) {
		if (q->backlogs[i] > maxbacklog) {
			maxbacklog = q->backlogs[i];
			idx = i;
		}
	}
	flow = &q->flows[idx];
	skb = dequeue_head(flow);
	len = qdisc_pkt_len(skb);
	q->backlogs[idx] -= len;
	kfree_skb(skb);
	sch->q.qlen--;
	sch->qstats.drops++;
	sch->qstats.backlog -= len;
	flow->dropped++;
	return idx;
}

static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	unsigned int idx;
	struct fq_codel_flow *flow;
	int uninitialized_var(ret);

	idx = fq_codel_classify(skb, sch, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	idx--;

	codel_set_enqueue_time(skb);
	flow = &q->flows[idx];
	flow_queue_add(flow, skb);
	q->backlogs[idx] += qdisc_pkt_len(skb);
	sch->qstats.backlog += qdisc_pkt_len(skb);

	if (list_empty(&flow->flowchain)) {
		list_add_tail(&flow->flowchain, &q->new_flows);
		q->new_flow_count++;
		flow->deficit = q->quantum;
		flow->dropped = 0;
	}
	if (++sch->q.qlen <= sch->limit)
		return NET_XMIT_SUCCESS;

	q->drop_overlimit++;
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (fq_codel_drop(sch) == idx)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

/* This is the specific function called from codel_dequeue()
 * to dequeue a packet from queue. Note: backlog is handled in
 * codel, we don't need to reduce it here.
 */
static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct fq_codel_flow *flow;
	struct sk_buff *skb = NULL;

	flow = container_of(vars, struct fq_codel_flow, cvars);
	if (flow->head) {
		skb = dequeue_head(flow);
		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
		sch->q.qlen--;
	}
	return skb;
}

static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	struct fq_codel_flow *flow;
	struct list_head *head;
	u32 prev_drop_count, prev_ecn_mark;

begin:
	head = &q->new_flows;
	if (list_empty(head)) {
		head = &q->old_flows;
		if (list_empty(head))
			return NULL;
	}
	flow = list_first_entry(head, struct fq_codel_flow, flowchain);

	if (flow->deficit <= 0) {
		flow->deficit += q->quantum;
		list_move_tail(&flow->flowchain, &q->old_flows);
		goto begin;
	}

	prev_drop_count = q->cstats.drop_count;
	prev_ecn_mark = q->cstats.ecn_mark;

	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
			    dequeue);

	flow->dropped += q->cstats.drop_count - prev_drop_count;
	flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;

	if (!skb) {
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && !list_empty(&q->old_flows))
			list_move_tail(&flow->flowchain, &q->old_flows);
		else
			list_del_init(&flow->flowchain);
		goto begin;
	}
	qdisc_bstats_update(sch, skb);
	flow->deficit -= qdisc_pkt_len(skb);
	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
	 * or HTB crashes. Defer it for next round.
	 */
	if (q->cstats.drop_count && sch->q.qlen) {
		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
		q->cstats.drop_count = 0;
	}
	return skb;
}

static void fq_codel_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = fq_codel_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
	[TCA_FQ_CODEL_TARGET]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_LIMIT]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_INTERVAL]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_ECN]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
};

static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
	if (err < 0)
		return err;
	if (tb[TCA_FQ_CODEL_FLOWS]) {
		if (q->flows)
			return -EINVAL;
		q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
		if (!q->flows_cnt ||
		    q->flows_cnt > 65536)
			return -EINVAL;
	}
	sch_tree_lock(sch);

	if (tb[TCA_FQ_CODEL_TARGET]) {
		u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);

		q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_FQ_CODEL_INTERVAL]) {
		u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);

		q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_FQ_CODEL_LIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);

	if (tb[TCA_FQ_CODEL_ECN])
		q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);

	if (tb[TCA_FQ_CODEL_QUANTUM])
		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));

	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_codel_dequeue(sch);

		kfree_skb(skb);
		q->cstats.drop_count++;
	}
	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
	q->cstats.drop_count = 0;

	sch_tree_unlock(sch);
	return 0;
}
static void *fq_codel_zalloc(size_t sz)
{
	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vzalloc(sz);
	return ptr;
}

static void fq_codel_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}

static void fq_codel_destroy(struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	fq_codel_free(q->backlogs);
	fq_codel_free(q->flows);
}

static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	int i;

	sch->limit = 10*1024;
	q->flows_cnt = 1024;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturbation = prandom_u32();
	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	codel_params_init(&q->cparams);
	codel_stats_init(&q->cstats);
	q->cparams.ecn = true;

	if (opt) {
		int err = fq_codel_change(sch, opt);
		if (err)
			return err;
	}

	if (!q->flows) {
		q->flows = fq_codel_zalloc(q->flows_cnt *
					   sizeof(struct fq_codel_flow));
		if (!q->flows)
			return -ENOMEM;
		q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
		if (!q->backlogs) {
			fq_codel_free(q->flows);
			return -ENOMEM;
		}
		for (i = 0; i < q->flows_cnt; i++) {
			struct fq_codel_flow *flow = q->flows + i;

			INIT_LIST_HEAD(&flow->flowchain);
			codel_vars_init(&flow->cvars);
		}
	}
	if (sch->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
			codel_time_to_us(q->cparams.target)) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
			sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
			codel_time_to_us(q->cparams.interval)) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_ECN,
			q->cparams.ecn) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
			q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
			q->flows_cnt))
		goto nla_put_failure;

	nla_nest_end(skb, opts);
	return skb->len;

nla_put_failure:
	return -1;
}

static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct tc_fq_codel_xstats st = {
		.type = TCA_FQ_CODEL_XSTATS_QDISC,
	};
	struct list_head *pos;

	st.qdisc_stats.maxpacket = q->cstats.maxpacket;
	st.qdisc_stats.drop_overlimit = q->drop_overlimit;
	st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
	st.qdisc_stats.new_flow_count = q->new_flow_count;

	list_for_each(pos, &q->new_flows)
		st.qdisc_stats.new_flows_len++;

	list_for_each(pos, &q->old_flows)
		st.qdisc_stats.old_flows_len++;

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
				   u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void fq_codel_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
			       struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				     struct gnet_dump *d)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	u32 idx = cl - 1;
	struct gnet_stats_queue qs = { 0 };
	struct tc_fq_codel_xstats xstats;

	if (idx < q->flows_cnt) {
		const struct fq_codel_flow *flow = &q->flows[idx];
		const struct sk_buff *skb = flow->head;

		memset(&xstats, 0, sizeof(xstats));
		xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
		xstats.class_stats.deficit = flow->deficit;
		xstats.class_stats.ldelay =
			codel_time_to_us(flow->cvars.ldelay);
		xstats.class_stats.count = flow->cvars.count;
		xstats.class_stats.lastcount = flow->cvars.lastcount;
		xstats.class_stats.dropping = flow->cvars.dropping;
		if (flow->cvars.dropping) {
			codel_tdiff_t delta = flow->cvars.drop_next -
					      codel_get_time();

			xstats.class_stats.drop_next = (delta >= 0) ?
				codel_time_to_us(delta) :
				-codel_time_to_us(-delta);
		}
		while (skb) {
			qs.qlen++;
			skb = skb->next;
		}
		qs.backlog = q->backlogs[idx];
		qs.drops = flow->dropped;
	}
	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	if (idx < q->flows_cnt)
		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
	return 0;
}

static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->flows_cnt; i++) {
		if (list_empty(&q->flows[i].flowchain) ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops fq_codel_class_ops = {
	.leaf		=	fq_codel_leaf,
	.get		=	fq_codel_get,
	.put		=	fq_codel_put,
	.tcf_chain	=	fq_codel_find_tcf,
	.bind_tcf	=	fq_codel_bind,
	.unbind_tcf	=	fq_codel_put,
	.dump		=	fq_codel_dump_class,
	.dump_stats	=	fq_codel_dump_class_stats,
	.walk		=	fq_codel_walk,
};

static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
	.cl_ops		=	&fq_codel_class_ops,
	.id		=	"fq_codel",
	.priv_size	=	sizeof(struct fq_codel_sched_data),
	.enqueue	=	fq_codel_enqueue,
	.dequeue	=	fq_codel_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	fq_codel_drop,
	.init		=	fq_codel_init,
	.reset		=	fq_codel_reset,
	.destroy	=	fq_codel_destroy,
	.change		=	fq_codel_change,
	.dump		=	fq_codel_dump,
	.dump_stats	=	fq_codel_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init fq_codel_module_init(void)
{
	return register_qdisc(&fq_codel_qdisc_ops);
}

static void __exit fq_codel_module_exit(void)
{
	unregister_qdisc(&fq_codel_qdisc_ops);
}

module_init(fq_codel_module_init)
module_exit(fq_codel_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
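/* Illustrative note (not part of the original source): once the module is
 * loaded and the qdisc attached, the per-qdisc and per-flow statistics filled
 * in by fq_codel_dump_stats() and fq_codel_dump_class_stats() above are
 * typically inspected from user space with commands along the lines of:
 *
 *	tc -s qdisc show dev eth0
 *	tc -s class show dev eth0
 *
 * (exact output format depends on the iproute2 version in use).
 */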