/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	q->gso_skb = skb;
	q->qstats.requeues++;
	qdisc_qstats_backlog_inc(q, skb);
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb->next = NULL;
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			q->skb_bad_txq = nskb;
			qdisc_qstats_backlog_inc(q, nskb);
			q->q.qlen++;
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb->next = NULL;
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
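 *
 * The list is built by try_bulk_dequeue_skb(): packets are chained via
 * skb->next for as long as the byte budget reported by
 * qdisc_avail_bulklimit(txq) allows, so sch_direct_xmit() can hand the
 * whole list to the driver in one go.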
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	struct sk_buff *skb = q->gso_skb;
	const struct netdev_queue *txq = q->dev_queue;

	*packets = 1;
	if (unlikely(skb)) {
		/* skb in gso_skb were already validated */
		*validate = false;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->gso_skb = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
		} else
			skb = NULL;
		goto trace;
	}
	*validate = true;
	skb = q->skb_bad_txq;
	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->skb_bad_txq = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
			goto bulk;
		}
		skb = NULL;
		goto trace;
	}
	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
	    !netif_xmit_frozen_or_stopped(txq))
		skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning the running seqcount guarantees that only one CPU
 * can execute this function.
 *
 * Returns to the caller:
 *	0  - queue is empty or throttled.
 *	>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev);

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		spin_lock(root_lock);
		return qdisc_qlen(q);
	}
	spin_lock(root_lock);

	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
		ret = qdisc_qlen(q);
	} else {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		ret = 0;

	return ret;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * The running seqcount guarantees that only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 * netif_tx_lock serializes accesses to the device driver.
 *
 * qdisc_lock(q) and netif_tx_lock are mutually exclusive:
 * if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *	0  - queue is empty or throttled.
 *	>0 - queue is not empty.
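 *
 * qdisc_restart() dequeues one skb (or a bulk-dequeued skb list) from
 * the qdisc and hands it to sch_direct_xmit() for transmission on the
 * corresponding txq.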
 *
 */
static inline int qdisc_restart(struct Qdisc *q, int *packets)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return 0;

	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		/*
		 * Ordered by possible occurrence: postpone processing if
		 * 1. we've exceeded the packet quota
		 * 2. another process needs the CPU
		 */
		quota -= packets;
		if (quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}

	qdisc_run_end(q);
}

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
					  dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 * netif_carrier_on - set carrier
 * @dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 * netif_carrier_off - clear carrier
 * @dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- queues for the three bands
 *	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 *	bitmap=0 means there are no skbs on any band.
 *	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
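 *	For example, bitmap=6 (binary 110, bands 1 and 2 non-empty) maps
 *	to band 1, the first band holding a packet: bitmap2band[6] == 1.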
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
					       int band)
{
	return priv->q + band;
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct qdisc_skb_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc, to_free);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct qdisc_skb_head *qh = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qh);

		if (likely(skb != NULL)) {
			qdisc_qstats_backlog_dec(qdisc, skb);
			qdisc_bstats_update(qdisc, skb);
		}

		qdisc->q.qlen--;
		if (qh->qlen == 0)
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct qdisc_skb_head *qh = band2list(priv, band);

		return qh->head;
	}

	return NULL;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		qdisc_skb_head_init(band2list(priv, prio));

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_fast_ops);

static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourselves */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(p);
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner))
		return NULL;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	kfree_skb(qdisc->skb_bad_txq);
	qdisc->skb_bad_txq = NULL;

	if (qdisc->gso_skb) {
		kfree_skb_list(qdisc->gso_skb);
		qdisc->gso_skb = NULL;
	}
	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb_list(qdisc->gso_skb);
	kfree_skb(qdisc->skb_bad_txq);
	qdisc_free(qdisc);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue.
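 * The new qdisc is installed as qdisc_sleeping only; the active
 * dev_queue->qdisc is pointed at noop_qdisc until dev_activate()
 * performs the switch-over.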
 */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
	}
	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to the device;
	 * create a default one for devices which need queueing,
	 * and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

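		/* Swap in the default (noop) qdisc so new packets are no
		 * longer queued here, then purge whatever the old qdisc
		 * still holds.
		 */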
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;
		root_lock = qdisc_lock(q);

		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

/**
 * dev_deactivate_many - deactivate transmissions on several devices
 * @head: list of devices to deactivate
 *
 * This function returns only when all outstanding transmissions
 * have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;
	bool sync_needed = false;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
		sync_needed |= !dev->dismantle;
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase:
	 * caller will call synchronize_net() for us.
	 */
	if (sync_needed)
		synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be:
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead:
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);

static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		/* Wait for flying RCU callback before it is freed. */
		rcu_barrier_bh();
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is the counterpart of the rcu barriers above. We need
		 * to block potential new users of miniq_old until all readers
		 * have stopped seeing it.
		 */
		call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);