/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
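
/*
 * Illustrative sketch (not part of the original file): the locking rules
 * described above in practice.  A pure reader may walk the device list
 * either under rcu_read_lock() or with dev_base_lock held for reading;
 * the namespace and message below are only examples.
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(&init_net, dev)
 *		pr_debug("saw device %s\n", dev->name);
 *	rcu_read_unlock();
 *
 * A writer instead holds the rtnl semaphore (rtnl_lock()) and additionally
 * takes write_lock_bh(&dev_base_lock) around the actual list update, as
 * list_netdevice() below does.
 */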

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
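
/*
 * Illustrative sketch (not part of the original file): how a protocol module
 * typically uses dev_add_pack()/dev_remove_pack().  The ethertype and the
 * handler below are made up for the example (0x88b5 is one of the IEEE
 * "local experimental" ethertypes).
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		// consume the skb and report it as handled
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_packet_type __read_mostly = {
 *		.type = cpu_to_be16(0x88b5),
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_packet_type);	// module init
 *	dev_remove_pack(&example_packet_type);	// module exit, may sleep
 */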

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
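
/*
 * Illustrative note (not part of the original file): as parsed above, the
 * boot parameter takes up to four integers followed by the interface name,
 * roughly "netdev=irq,base_addr,mem_start,mem_end,name".  A concrete
 * (made-up) example on the kernel command line:
 *
 *	netdev=9,0x300,0,0,eth1
 *
 * A legacy ISA-style driver would then pick these values up in its probe
 * path by calling netdev_boot_setup_check(dev) before requesting its
 * IRQ and I/O region.
 */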

/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink - get 'iflink' value of a interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	/* If dev->rtnl_link_ops is set, it's a virtual interface. */
	if (dev->rtnl_link_ops)
		return 0;

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
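
/*
 * Illustrative sketch (not part of the original file): the refcounting
 * contract of dev_get_by_name().  The namespace and interface name below
 * are just examples.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		netdev_info(dev, "ifindex %d\n", dev->ifindex);
 *		dev_put(dev);		// drop the reference taken above
 *	}
 *
 * Under rcu_read_lock() a caller can avoid the refcount by using
 * dev_get_by_name_rcu() instead, as long as the pointer is not used after
 * rcu_read_unlock().
 */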

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
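
/*
 * Illustrative sketch (not part of the original file): how a driver or the
 * core typically uses the name allocator.  Passing a format such as "eth%d"
 * picks the first free unit number; the error handling shown is only an
 * example.
 *
 *	err = dev_alloc_name(dev, "eth%d");	// e.g. becomes "eth2"
 *	if (err < 0)
 *		goto out_free_netdev;
 *
 * Callers that want to accept either a fixed name or a "%d" pattern go
 * through dev_get_valid_name(), as dev_change_name() below does.
 */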

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open - prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
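
/*
 * Illustrative sketch (not part of the original file): dev_open() and
 * dev_close() below expect the caller to hold the rtnl semaphore; the
 * interface name is only an example.
 *
 *	struct net_device *dev;
 *	int err = 0;
 *
 *	rtnl_lock();
 *	dev = __dev_get_by_name(&init_net, "eth0");
 *	if (dev && !(dev->flags & IFF_UP))
 *		err = dev_open(dev);
 *	rtnl_unlock();
 *
 * Userspace reaches the same paths through SIOCSIFFLAGS or rtnetlink when
 * it sets IFF_UP on an interface.
 */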

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
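
/*
 * Illustrative sketch (not part of the original file): a minimal netdevice
 * notifier.  netdev_notifier_info_to_dev() recovers the net_device from the
 * info block passed along the chain.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			netdev_info(dev, "is up\n");
 *			break;
 *		case NETDEV_UNREGISTER:
 *			netdev_info(dev, "is going away\n");
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&example_netdev_nb);
 */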

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)		\
			__net_timestamp(SKB);			\
	}							\

bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
			atomic_long_inc(&dev->rx_dropped);
			kfree_skb(skb);
			return NET_RX_DROP;
		}
	}

	if (unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
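
/*
 * Illustrative sketch (not part of the original file): how a virtual or
 * tunnel-style driver might hand a frame to a peer device from its
 * ndo_start_xmit() implementation; "peer" stands for whatever net_device
 * the driver resolved as the destination, and the stats handling is only
 * an example.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = ...;	// driver-specific lookup
 *		unsigned int len = skb->len;	// save before skb is consumed
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
 *			dev->stats.tx_packets++;
 *			dev->stats.tx_bytes += len;
 *		} else {
 *			dev->stats.tx_dropped++;
 *		}
 *		return NETDEV_TX_OK;
 *	}
 *
 * This mirrors what pairs of virtual devices (e.g. veth-like drivers) do:
 * the skb is scrubbed for the namespace crossing and then fed to netif_rx().
 */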

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}

static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}

static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}

int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
                                xmap_dereference(dev_maps->cpu_map[cpu]) :
                                NULL;

                map = expand_xps_map(map, cpu, index);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
        }

        if (!new_dev_maps)
                goto out_no_new_maps;

        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
                        /* add queue to CPU maps */
                        int pos = 0;

                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (numa_node_id == -2)
                                numa_node_id = cpu_to_node(cpu);
                        else if (numa_node_id != cpu_to_node(cpu))
                                numa_node_id = -1;
#endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
                }

        }

        rcu_assign_pointer(dev->xps_maps, new_dev_maps);

        /* Cleanup old maps */
        if (dev_maps) {
                for_each_possible_cpu(cpu) {
                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }

                kfree_rcu(dev_maps, rcu);
        }

        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        /* update Tx queue numa node */
        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                     (numa_node_id >= 0) ? numa_node_id :
                                     NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* remove queue from unused CPUs */
        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
                        continue;

                if (remove_xps_queue(dev_maps, cpu, index))
                        active = true;
        }

        /* free map if not active */
        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for_each_possible_cpu(cpu) {
                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;
                if (new_map && new_map != map)
                        kfree(new_map);
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
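 *
 * A minimal usage sketch (hypothetical driver code, not taken from this
 * file): a multiqueue driver that discovers at probe or open time that only
 * "nr_usable" of its allocated queues can actually be serviced might call,
 * under rtnl_lock():
 *
 *      err = netif_set_real_num_tx_queues(netdev, nr_usable);
 *      if (err)
 *              return err;
 *
 * Here "netdev", "nr_usable" and "err" are placeholders. The rtnl
 * requirement only applies once the device is registered; see the
 * reg_state checks in the function below.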
2143 */ 2144 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2145 { 2146 int rc; 2147 2148 if (txq < 1 || txq > dev->num_tx_queues) 2149 return -EINVAL; 2150 2151 if (dev->reg_state == NETREG_REGISTERED || 2152 dev->reg_state == NETREG_UNREGISTERING) { 2153 ASSERT_RTNL(); 2154 2155 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2156 txq); 2157 if (rc) 2158 return rc; 2159 2160 if (dev->num_tc) 2161 netif_setup_tc(dev, txq); 2162 2163 if (txq < dev->real_num_tx_queues) { 2164 qdisc_reset_all_tx_gt(dev, txq); 2165 #ifdef CONFIG_XPS 2166 netif_reset_xps_queues_gt(dev, txq); 2167 #endif 2168 } 2169 } 2170 2171 dev->real_num_tx_queues = txq; 2172 return 0; 2173 } 2174 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2175 2176 #ifdef CONFIG_SYSFS 2177 /** 2178 * netif_set_real_num_rx_queues - set actual number of RX queues used 2179 * @dev: Network device 2180 * @rxq: Actual number of RX queues 2181 * 2182 * This must be called either with the rtnl_lock held or before 2183 * registration of the net device. Returns 0 on success, or a 2184 * negative error code. If called before registration, it always 2185 * succeeds. 2186 */ 2187 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2188 { 2189 int rc; 2190 2191 if (rxq < 1 || rxq > dev->num_rx_queues) 2192 return -EINVAL; 2193 2194 if (dev->reg_state == NETREG_REGISTERED) { 2195 ASSERT_RTNL(); 2196 2197 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2198 rxq); 2199 if (rc) 2200 return rc; 2201 } 2202 2203 dev->real_num_rx_queues = rxq; 2204 return 0; 2205 } 2206 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2207 #endif 2208 2209 /** 2210 * netif_get_num_default_rss_queues - default number of RSS queues 2211 * 2212 * This routine should set an upper limit on the number of RSS queues 2213 * used by default by multiqueue devices. 2214 */ 2215 int netif_get_num_default_rss_queues(void) 2216 { 2217 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2218 } 2219 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2220 2221 static inline void __netif_reschedule(struct Qdisc *q) 2222 { 2223 struct softnet_data *sd; 2224 unsigned long flags; 2225 2226 local_irq_save(flags); 2227 sd = this_cpu_ptr(&softnet_data); 2228 q->next_sched = NULL; 2229 *sd->output_queue_tailp = q; 2230 sd->output_queue_tailp = &q->next_sched; 2231 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2232 local_irq_restore(flags); 2233 } 2234 2235 void __netif_schedule(struct Qdisc *q) 2236 { 2237 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2238 __netif_reschedule(q); 2239 } 2240 EXPORT_SYMBOL(__netif_schedule); 2241 2242 struct dev_kfree_skb_cb { 2243 enum skb_free_reason reason; 2244 }; 2245 2246 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2247 { 2248 return (struct dev_kfree_skb_cb *)skb->cb; 2249 } 2250 2251 void netif_schedule_queue(struct netdev_queue *txq) 2252 { 2253 rcu_read_lock(); 2254 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2255 struct Qdisc *q = rcu_dereference(txq->qdisc); 2256 2257 __netif_schedule(q); 2258 } 2259 rcu_read_unlock(); 2260 } 2261 EXPORT_SYMBOL(netif_schedule_queue); 2262 2263 /** 2264 * netif_wake_subqueue - allow sending packets on subqueue 2265 * @dev: network device 2266 * @queue_index: sub queue index 2267 * 2268 * Resume individual transmit queue of a device with multiple transmit queues. 
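 *
 * A minimal usage sketch (hypothetical driver code, not taken from this
 * file): a driver that stopped one of its rings with netif_stop_subqueue()
 * when the descriptor ring filled up might re-enable it from its
 * TX-completion handler once space is available again:
 *
 *      if (ring_space(ring) >= MAX_SKB_FRAGS + 1)
 *              netif_wake_subqueue(ring->netdev, ring->queue_index);
 *
 * Here "ring", "ring_space()" and the wake threshold are driver-specific
 * placeholders.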
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
                struct Qdisc *q;

                rcu_read_lock();
                q = rcu_dereference(txq->qdisc);
                __netif_schedule(q);
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
                struct Qdisc *q;

                rcu_read_lock();
                q = rcu_dereference(dev_queue->qdisc);
                __netif_schedule(q);
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(netif_tx_wake_queue);

void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
        unsigned long flags;

        if (likely(atomic_read(&skb->users) == 1)) {
                smp_rmb();
                atomic_set(&skb->users, 0);
        } else if (likely(!atomic_dec_and_test(&skb->users))) {
                return;
        }
        get_kfree_skb_cb(skb)->reason = reason;
        local_irq_save(flags);
        skb->next = __this_cpu_read(softnet_data.completion_queue);
        __this_cpu_write(softnet_data.completion_queue, skb);
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
        if (in_irq() || irqs_disabled())
                __dev_kfree_skb_irq(skb, reason);
        else
                dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from the system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
            netif_running(dev)) {
                netif_tx_stop_all_queues(dev);
        }
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system again and restart the transmit
 * queues if needed.
 */
void netif_device_attach(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
            netif_running(dev)) {
                netif_tx_wake_all_queues(dev);
                __netdev_watchdog_up(dev);
        }
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * Returns a Tx hash based on the given packet descriptor and a Tx queue
 * count to be used as a distribution range.
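 *
 * Most callers use the skb_tx_hash() wrapper (as __netdev_pick_tx() does
 * further down), which passes dev->real_num_tx_queues as the range. A
 * minimal sketch of that common case:
 *
 *      u16 qindex = skb_tx_hash(dev, skb);
 *
 * which always yields a queue index below dev->real_num_tx_queues. When
 * the device uses traffic classes (dev->num_tc), the hash is additionally
 * offset into the queue range of the skb's traffic class, as computed
 * below.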
2361 */ 2362 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2363 unsigned int num_tx_queues) 2364 { 2365 u32 hash; 2366 u16 qoffset = 0; 2367 u16 qcount = num_tx_queues; 2368 2369 if (skb_rx_queue_recorded(skb)) { 2370 hash = skb_get_rx_queue(skb); 2371 while (unlikely(hash >= num_tx_queues)) 2372 hash -= num_tx_queues; 2373 return hash; 2374 } 2375 2376 if (dev->num_tc) { 2377 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2378 qoffset = dev->tc_to_txq[tc].offset; 2379 qcount = dev->tc_to_txq[tc].count; 2380 } 2381 2382 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2383 } 2384 EXPORT_SYMBOL(__skb_tx_hash); 2385 2386 static void skb_warn_bad_offload(const struct sk_buff *skb) 2387 { 2388 static const netdev_features_t null_features = 0; 2389 struct net_device *dev = skb->dev; 2390 const char *driver = ""; 2391 2392 if (!net_ratelimit()) 2393 return; 2394 2395 if (dev && dev->dev.parent) 2396 driver = dev_driver_string(dev->dev.parent); 2397 2398 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2399 "gso_type=%d ip_summed=%d\n", 2400 driver, dev ? &dev->features : &null_features, 2401 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2402 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2403 skb_shinfo(skb)->gso_type, skb->ip_summed); 2404 } 2405 2406 /* 2407 * Invalidate hardware checksum when packet is to be mangled, and 2408 * complete checksum manually on outgoing path. 2409 */ 2410 int skb_checksum_help(struct sk_buff *skb) 2411 { 2412 __wsum csum; 2413 int ret = 0, offset; 2414 2415 if (skb->ip_summed == CHECKSUM_COMPLETE) 2416 goto out_set_summed; 2417 2418 if (unlikely(skb_shinfo(skb)->gso_size)) { 2419 skb_warn_bad_offload(skb); 2420 return -EINVAL; 2421 } 2422 2423 /* Before computing a checksum, we should make sure no frag could 2424 * be modified by an external entity : checksum could be wrong. 2425 */ 2426 if (skb_has_shared_frag(skb)) { 2427 ret = __skb_linearize(skb); 2428 if (ret) 2429 goto out; 2430 } 2431 2432 offset = skb_checksum_start_offset(skb); 2433 BUG_ON(offset >= skb_headlen(skb)); 2434 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2435 2436 offset += skb->csum_offset; 2437 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2438 2439 if (skb_cloned(skb) && 2440 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2441 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2442 if (ret) 2443 goto out; 2444 } 2445 2446 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2447 out_set_summed: 2448 skb->ip_summed = CHECKSUM_NONE; 2449 out: 2450 return ret; 2451 } 2452 EXPORT_SYMBOL(skb_checksum_help); 2453 2454 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2455 { 2456 __be16 type = skb->protocol; 2457 2458 /* Tunnel gso handlers can set protocol to ethernet. */ 2459 if (type == htons(ETH_P_TEB)) { 2460 struct ethhdr *eth; 2461 2462 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2463 return 0; 2464 2465 eth = (struct ethhdr *)skb_mac_header(skb); 2466 type = eth->h_proto; 2467 } 2468 2469 return __vlan_get_protocol(skb, type, depth); 2470 } 2471 2472 /** 2473 * skb_mac_gso_segment - mac layer segmentation handler. 
2474 * @skb: buffer to segment 2475 * @features: features for the output path (see dev->features) 2476 */ 2477 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2478 netdev_features_t features) 2479 { 2480 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2481 struct packet_offload *ptype; 2482 int vlan_depth = skb->mac_len; 2483 __be16 type = skb_network_protocol(skb, &vlan_depth); 2484 2485 if (unlikely(!type)) 2486 return ERR_PTR(-EINVAL); 2487 2488 __skb_pull(skb, vlan_depth); 2489 2490 rcu_read_lock(); 2491 list_for_each_entry_rcu(ptype, &offload_base, list) { 2492 if (ptype->type == type && ptype->callbacks.gso_segment) { 2493 segs = ptype->callbacks.gso_segment(skb, features); 2494 break; 2495 } 2496 } 2497 rcu_read_unlock(); 2498 2499 __skb_push(skb, skb->data - skb_mac_header(skb)); 2500 2501 return segs; 2502 } 2503 EXPORT_SYMBOL(skb_mac_gso_segment); 2504 2505 2506 /* openvswitch calls this on rx path, so we need a different check. 2507 */ 2508 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2509 { 2510 if (tx_path) 2511 return skb->ip_summed != CHECKSUM_PARTIAL; 2512 else 2513 return skb->ip_summed == CHECKSUM_NONE; 2514 } 2515 2516 /** 2517 * __skb_gso_segment - Perform segmentation on skb. 2518 * @skb: buffer to segment 2519 * @features: features for the output path (see dev->features) 2520 * @tx_path: whether it is called in TX path 2521 * 2522 * This function segments the given skb and returns a list of segments. 2523 * 2524 * It may return NULL if the skb requires no segmentation. This is 2525 * only possible when GSO is used for verifying header integrity. 2526 */ 2527 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2528 netdev_features_t features, bool tx_path) 2529 { 2530 if (unlikely(skb_needs_check(skb, tx_path))) { 2531 int err; 2532 2533 skb_warn_bad_offload(skb); 2534 2535 err = skb_cow_head(skb, 0); 2536 if (err < 0) 2537 return ERR_PTR(err); 2538 } 2539 2540 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2541 SKB_GSO_CB(skb)->encap_level = 0; 2542 2543 skb_reset_mac_header(skb); 2544 skb_reset_mac_len(skb); 2545 2546 return skb_mac_gso_segment(skb, features); 2547 } 2548 EXPORT_SYMBOL(__skb_gso_segment); 2549 2550 /* Take action when hardware reception checksum errors are detected. */ 2551 #ifdef CONFIG_BUG 2552 void netdev_rx_csum_fault(struct net_device *dev) 2553 { 2554 if (net_ratelimit()) { 2555 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2556 dump_stack(); 2557 } 2558 } 2559 EXPORT_SYMBOL(netdev_rx_csum_fault); 2560 #endif 2561 2562 /* Actually, we should eliminate this check as soon as we know, that: 2563 * 1. IOMMU is present and allows to map all the memory. 2564 * 2. No high memory really exists on this machine. 
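 *
 * Until then, note that harmonize_features() further down uses this check
 * to clear NETIF_F_SG for skbs whose page fragments the device cannot reach
 * by DMA, so such skbs are typically linearized on the transmit path (see
 * validate_xmit_skb()).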
2565 */ 2566 2567 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2568 { 2569 #ifdef CONFIG_HIGHMEM 2570 int i; 2571 if (!(dev->features & NETIF_F_HIGHDMA)) { 2572 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2573 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2574 if (PageHighMem(skb_frag_page(frag))) 2575 return 1; 2576 } 2577 } 2578 2579 if (PCI_DMA_BUS_IS_PHYS) { 2580 struct device *pdev = dev->dev.parent; 2581 2582 if (!pdev) 2583 return 0; 2584 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2585 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2586 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2587 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2588 return 1; 2589 } 2590 } 2591 #endif 2592 return 0; 2593 } 2594 2595 /* If MPLS offload request, verify we are testing hardware MPLS features 2596 * instead of standard features for the netdev. 2597 */ 2598 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2599 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2600 netdev_features_t features, 2601 __be16 type) 2602 { 2603 if (eth_p_mpls(type)) 2604 features &= skb->dev->mpls_features; 2605 2606 return features; 2607 } 2608 #else 2609 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2610 netdev_features_t features, 2611 __be16 type) 2612 { 2613 return features; 2614 } 2615 #endif 2616 2617 static netdev_features_t harmonize_features(struct sk_buff *skb, 2618 netdev_features_t features) 2619 { 2620 int tmp; 2621 __be16 type; 2622 2623 type = skb_network_protocol(skb, &tmp); 2624 features = net_mpls_features(skb, features, type); 2625 2626 if (skb->ip_summed != CHECKSUM_NONE && 2627 !can_checksum_protocol(features, type)) { 2628 features &= ~NETIF_F_ALL_CSUM; 2629 } else if (illegal_highdma(skb->dev, skb)) { 2630 features &= ~NETIF_F_SG; 2631 } 2632 2633 return features; 2634 } 2635 2636 netdev_features_t passthru_features_check(struct sk_buff *skb, 2637 struct net_device *dev, 2638 netdev_features_t features) 2639 { 2640 return features; 2641 } 2642 EXPORT_SYMBOL(passthru_features_check); 2643 2644 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2645 struct net_device *dev, 2646 netdev_features_t features) 2647 { 2648 return vlan_features_check(skb, features); 2649 } 2650 2651 netdev_features_t netif_skb_features(struct sk_buff *skb) 2652 { 2653 struct net_device *dev = skb->dev; 2654 netdev_features_t features = dev->features; 2655 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2656 2657 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) 2658 features &= ~NETIF_F_GSO_MASK; 2659 2660 /* If encapsulation offload request, verify we are testing 2661 * hardware encapsulation features instead of standard 2662 * features for the netdev 2663 */ 2664 if (skb->encapsulation) 2665 features &= dev->hw_enc_features; 2666 2667 if (skb_vlan_tagged(skb)) 2668 features = netdev_intersect_features(features, 2669 dev->vlan_features | 2670 NETIF_F_HW_VLAN_CTAG_TX | 2671 NETIF_F_HW_VLAN_STAG_TX); 2672 2673 if (dev->netdev_ops->ndo_features_check) 2674 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2675 features); 2676 else 2677 features &= dflt_features_check(skb, dev, features); 2678 2679 return harmonize_features(skb, features); 2680 } 2681 EXPORT_SYMBOL(netif_skb_features); 2682 2683 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2684 struct netdev_queue *txq, bool more) 2685 { 2686 unsigned int len; 2687 int rc; 2688 2689 if (!list_empty(&ptype_all) || 
!list_empty(&dev->ptype_all)) 2690 dev_queue_xmit_nit(skb, dev); 2691 2692 len = skb->len; 2693 trace_net_dev_start_xmit(skb, dev); 2694 rc = netdev_start_xmit(skb, dev, txq, more); 2695 trace_net_dev_xmit(skb, rc, dev, len); 2696 2697 return rc; 2698 } 2699 2700 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2701 struct netdev_queue *txq, int *ret) 2702 { 2703 struct sk_buff *skb = first; 2704 int rc = NETDEV_TX_OK; 2705 2706 while (skb) { 2707 struct sk_buff *next = skb->next; 2708 2709 skb->next = NULL; 2710 rc = xmit_one(skb, dev, txq, next != NULL); 2711 if (unlikely(!dev_xmit_complete(rc))) { 2712 skb->next = next; 2713 goto out; 2714 } 2715 2716 skb = next; 2717 if (netif_xmit_stopped(txq) && skb) { 2718 rc = NETDEV_TX_BUSY; 2719 break; 2720 } 2721 } 2722 2723 out: 2724 *ret = rc; 2725 return skb; 2726 } 2727 2728 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2729 netdev_features_t features) 2730 { 2731 if (skb_vlan_tag_present(skb) && 2732 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2733 skb = __vlan_hwaccel_push_inside(skb); 2734 return skb; 2735 } 2736 2737 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2738 { 2739 netdev_features_t features; 2740 2741 if (skb->next) 2742 return skb; 2743 2744 features = netif_skb_features(skb); 2745 skb = validate_xmit_vlan(skb, features); 2746 if (unlikely(!skb)) 2747 goto out_null; 2748 2749 if (netif_needs_gso(skb, features)) { 2750 struct sk_buff *segs; 2751 2752 segs = skb_gso_segment(skb, features); 2753 if (IS_ERR(segs)) { 2754 goto out_kfree_skb; 2755 } else if (segs) { 2756 consume_skb(skb); 2757 skb = segs; 2758 } 2759 } else { 2760 if (skb_needs_linearize(skb, features) && 2761 __skb_linearize(skb)) 2762 goto out_kfree_skb; 2763 2764 /* If packet is not checksummed and device does not 2765 * support checksumming for this protocol, complete 2766 * checksumming here. 2767 */ 2768 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2769 if (skb->encapsulation) 2770 skb_set_inner_transport_header(skb, 2771 skb_checksum_start_offset(skb)); 2772 else 2773 skb_set_transport_header(skb, 2774 skb_checksum_start_offset(skb)); 2775 if (!(features & NETIF_F_ALL_CSUM) && 2776 skb_checksum_help(skb)) 2777 goto out_kfree_skb; 2778 } 2779 } 2780 2781 return skb; 2782 2783 out_kfree_skb: 2784 kfree_skb(skb); 2785 out_null: 2786 return NULL; 2787 } 2788 2789 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 2790 { 2791 struct sk_buff *next, *head = NULL, *tail; 2792 2793 for (; skb != NULL; skb = next) { 2794 next = skb->next; 2795 skb->next = NULL; 2796 2797 /* in case skb wont be segmented, point to itself */ 2798 skb->prev = skb; 2799 2800 skb = validate_xmit_skb(skb, dev); 2801 if (!skb) 2802 continue; 2803 2804 if (!head) 2805 head = skb; 2806 else 2807 tail->next = skb; 2808 /* If skb was segmented, skb->prev points to 2809 * the last segment. If not, it still contains skb. 
2810 */ 2811 tail = skb->prev; 2812 } 2813 return head; 2814 } 2815 2816 static void qdisc_pkt_len_init(struct sk_buff *skb) 2817 { 2818 const struct skb_shared_info *shinfo = skb_shinfo(skb); 2819 2820 qdisc_skb_cb(skb)->pkt_len = skb->len; 2821 2822 /* To get more precise estimation of bytes sent on wire, 2823 * we add to pkt_len the headers size of all segments 2824 */ 2825 if (shinfo->gso_size) { 2826 unsigned int hdr_len; 2827 u16 gso_segs = shinfo->gso_segs; 2828 2829 /* mac layer + network layer */ 2830 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 2831 2832 /* + transport layer */ 2833 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 2834 hdr_len += tcp_hdrlen(skb); 2835 else 2836 hdr_len += sizeof(struct udphdr); 2837 2838 if (shinfo->gso_type & SKB_GSO_DODGY) 2839 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 2840 shinfo->gso_size); 2841 2842 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 2843 } 2844 } 2845 2846 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2847 struct net_device *dev, 2848 struct netdev_queue *txq) 2849 { 2850 spinlock_t *root_lock = qdisc_lock(q); 2851 bool contended; 2852 int rc; 2853 2854 qdisc_pkt_len_init(skb); 2855 qdisc_calculate_pkt_len(skb, q); 2856 /* 2857 * Heuristic to force contended enqueues to serialize on a 2858 * separate lock before trying to get qdisc main lock. 2859 * This permits __QDISC___STATE_RUNNING owner to get the lock more 2860 * often and dequeue packets faster. 2861 */ 2862 contended = qdisc_is_running(q); 2863 if (unlikely(contended)) 2864 spin_lock(&q->busylock); 2865 2866 spin_lock(root_lock); 2867 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2868 kfree_skb(skb); 2869 rc = NET_XMIT_DROP; 2870 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2871 qdisc_run_begin(q)) { 2872 /* 2873 * This is a work-conserving queue; there are no old skbs 2874 * waiting to be sent out; and the qdisc is not running - 2875 * xmit the skb directly. 
2876 */ 2877 2878 qdisc_bstats_update(q, skb); 2879 2880 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 2881 if (unlikely(contended)) { 2882 spin_unlock(&q->busylock); 2883 contended = false; 2884 } 2885 __qdisc_run(q); 2886 } else 2887 qdisc_run_end(q); 2888 2889 rc = NET_XMIT_SUCCESS; 2890 } else { 2891 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 2892 if (qdisc_run_begin(q)) { 2893 if (unlikely(contended)) { 2894 spin_unlock(&q->busylock); 2895 contended = false; 2896 } 2897 __qdisc_run(q); 2898 } 2899 } 2900 spin_unlock(root_lock); 2901 if (unlikely(contended)) 2902 spin_unlock(&q->busylock); 2903 return rc; 2904 } 2905 2906 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 2907 static void skb_update_prio(struct sk_buff *skb) 2908 { 2909 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 2910 2911 if (!skb->priority && skb->sk && map) { 2912 unsigned int prioidx = skb->sk->sk_cgrp_prioidx; 2913 2914 if (prioidx < map->priomap_len) 2915 skb->priority = map->priomap[prioidx]; 2916 } 2917 } 2918 #else 2919 #define skb_update_prio(skb) 2920 #endif 2921 2922 DEFINE_PER_CPU(int, xmit_recursion); 2923 EXPORT_SYMBOL(xmit_recursion); 2924 2925 #define RECURSION_LIMIT 10 2926 2927 /** 2928 * dev_loopback_xmit - loop back @skb 2929 * @skb: buffer to transmit 2930 */ 2931 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) 2932 { 2933 skb_reset_mac_header(skb); 2934 __skb_pull(skb, skb_network_offset(skb)); 2935 skb->pkt_type = PACKET_LOOPBACK; 2936 skb->ip_summed = CHECKSUM_UNNECESSARY; 2937 WARN_ON(!skb_dst(skb)); 2938 skb_dst_force(skb); 2939 netif_rx_ni(skb); 2940 return 0; 2941 } 2942 EXPORT_SYMBOL(dev_loopback_xmit); 2943 2944 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 2945 { 2946 #ifdef CONFIG_XPS 2947 struct xps_dev_maps *dev_maps; 2948 struct xps_map *map; 2949 int queue_index = -1; 2950 2951 rcu_read_lock(); 2952 dev_maps = rcu_dereference(dev->xps_maps); 2953 if (dev_maps) { 2954 map = rcu_dereference( 2955 dev_maps->cpu_map[skb->sender_cpu - 1]); 2956 if (map) { 2957 if (map->len == 1) 2958 queue_index = map->queues[0]; 2959 else 2960 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 2961 map->len)]; 2962 if (unlikely(queue_index >= dev->real_num_tx_queues)) 2963 queue_index = -1; 2964 } 2965 } 2966 rcu_read_unlock(); 2967 2968 return queue_index; 2969 #else 2970 return -1; 2971 #endif 2972 } 2973 2974 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 2975 { 2976 struct sock *sk = skb->sk; 2977 int queue_index = sk_tx_queue_get(sk); 2978 2979 if (queue_index < 0 || skb->ooo_okay || 2980 queue_index >= dev->real_num_tx_queues) { 2981 int new_index = get_xps_queue(dev, skb); 2982 if (new_index < 0) 2983 new_index = skb_tx_hash(dev, skb); 2984 2985 if (queue_index != new_index && sk && 2986 rcu_access_pointer(sk->sk_dst_cache)) 2987 sk_tx_queue_set(sk, new_index); 2988 2989 queue_index = new_index; 2990 } 2991 2992 return queue_index; 2993 } 2994 2995 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 2996 struct sk_buff *skb, 2997 void *accel_priv) 2998 { 2999 int queue_index = 0; 3000 3001 #ifdef CONFIG_XPS 3002 if (skb->sender_cpu == 0) 3003 skb->sender_cpu = raw_smp_processor_id() + 1; 3004 #endif 3005 3006 if (dev->real_num_tx_queues != 1) { 3007 const struct net_device_ops *ops = dev->netdev_ops; 3008 if (ops->ndo_select_queue) 3009 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3010 __netdev_pick_tx); 3011 else 3012 queue_index = __netdev_pick_tx(dev, skb); 3013 3014 if 
 (!accel_priv)
                        queue_index = netdev_cap_txqueue(dev, queue_index);
        }

        skb_set_queue_mapping(skb, queue_index);
        return netdev_get_tx_queue(dev, queue_index);
}

/**
 * __dev_queue_xmit - transmit a buffer
 * @skb: buffer to transmit
 * @accel_priv: private data used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * A negative errno code is returned on a failure. A success does not
 * guarantee the frame will be transmitted as it may be dropped due
 * to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 * I notice this method can also return errors from the queue disciplines,
 * including NET_XMIT_DROP, which is a positive value.  So, errors can also
 * be positive.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method.  (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * When calling this method, interrupts MUST be enabled.  This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        skb_reset_mac_header(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        /* If device/qdisc don't need skb->dst, release it right now while
         * it's hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible that they rely on the protection
           made by us here.

           Check this and take the lock. It is not prone to deadlocks.
3097 Either shot noqueue qdisc, it is even simpler 8) 3098 */ 3099 if (dev->flags & IFF_UP) { 3100 int cpu = smp_processor_id(); /* ok because BHs are off */ 3101 3102 if (txq->xmit_lock_owner != cpu) { 3103 3104 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) 3105 goto recursion_alert; 3106 3107 skb = validate_xmit_skb(skb, dev); 3108 if (!skb) 3109 goto drop; 3110 3111 HARD_TX_LOCK(dev, txq, cpu); 3112 3113 if (!netif_xmit_stopped(txq)) { 3114 __this_cpu_inc(xmit_recursion); 3115 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 3116 __this_cpu_dec(xmit_recursion); 3117 if (dev_xmit_complete(rc)) { 3118 HARD_TX_UNLOCK(dev, txq); 3119 goto out; 3120 } 3121 } 3122 HARD_TX_UNLOCK(dev, txq); 3123 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 3124 dev->name); 3125 } else { 3126 /* Recursion is detected! It is possible, 3127 * unfortunately 3128 */ 3129 recursion_alert: 3130 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 3131 dev->name); 3132 } 3133 } 3134 3135 rc = -ENETDOWN; 3136 drop: 3137 rcu_read_unlock_bh(); 3138 3139 atomic_long_inc(&dev->tx_dropped); 3140 kfree_skb_list(skb); 3141 return rc; 3142 out: 3143 rcu_read_unlock_bh(); 3144 return rc; 3145 } 3146 3147 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb) 3148 { 3149 return __dev_queue_xmit(skb, NULL); 3150 } 3151 EXPORT_SYMBOL(dev_queue_xmit_sk); 3152 3153 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3154 { 3155 return __dev_queue_xmit(skb, accel_priv); 3156 } 3157 EXPORT_SYMBOL(dev_queue_xmit_accel); 3158 3159 3160 /*======================================================================= 3161 Receiver routines 3162 =======================================================================*/ 3163 3164 int netdev_max_backlog __read_mostly = 1000; 3165 EXPORT_SYMBOL(netdev_max_backlog); 3166 3167 int netdev_tstamp_prequeue __read_mostly = 1; 3168 int netdev_budget __read_mostly = 300; 3169 int weight_p __read_mostly = 64; /* old backlog weight */ 3170 3171 /* Called with irq disabled */ 3172 static inline void ____napi_schedule(struct softnet_data *sd, 3173 struct napi_struct *napi) 3174 { 3175 list_add_tail(&napi->poll_list, &sd->poll_list); 3176 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3177 } 3178 3179 #ifdef CONFIG_RPS 3180 3181 /* One global table that all flow-based protocols share. */ 3182 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3183 EXPORT_SYMBOL(rps_sock_flow_table); 3184 u32 rps_cpu_mask __read_mostly; 3185 EXPORT_SYMBOL(rps_cpu_mask); 3186 3187 struct static_key rps_needed __read_mostly; 3188 3189 static struct rps_dev_flow * 3190 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3191 struct rps_dev_flow *rflow, u16 next_cpu) 3192 { 3193 if (next_cpu < nr_cpu_ids) { 3194 #ifdef CONFIG_RFS_ACCEL 3195 struct netdev_rx_queue *rxqueue; 3196 struct rps_dev_flow_table *flow_table; 3197 struct rps_dev_flow *old_rflow; 3198 u32 flow_id; 3199 u16 rxq_index; 3200 int rc; 3201 3202 /* Should we steer this flow to a different hardware queue? 
*/ 3203 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3204 !(dev->features & NETIF_F_NTUPLE)) 3205 goto out; 3206 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3207 if (rxq_index == skb_get_rx_queue(skb)) 3208 goto out; 3209 3210 rxqueue = dev->_rx + rxq_index; 3211 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3212 if (!flow_table) 3213 goto out; 3214 flow_id = skb_get_hash(skb) & flow_table->mask; 3215 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3216 rxq_index, flow_id); 3217 if (rc < 0) 3218 goto out; 3219 old_rflow = rflow; 3220 rflow = &flow_table->flows[flow_id]; 3221 rflow->filter = rc; 3222 if (old_rflow->filter == rflow->filter) 3223 old_rflow->filter = RPS_NO_FILTER; 3224 out: 3225 #endif 3226 rflow->last_qtail = 3227 per_cpu(softnet_data, next_cpu).input_queue_head; 3228 } 3229 3230 rflow->cpu = next_cpu; 3231 return rflow; 3232 } 3233 3234 /* 3235 * get_rps_cpu is called from netif_receive_skb and returns the target 3236 * CPU from the RPS map of the receiving queue for a given skb. 3237 * rcu_read_lock must be held on entry. 3238 */ 3239 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3240 struct rps_dev_flow **rflowp) 3241 { 3242 const struct rps_sock_flow_table *sock_flow_table; 3243 struct netdev_rx_queue *rxqueue = dev->_rx; 3244 struct rps_dev_flow_table *flow_table; 3245 struct rps_map *map; 3246 int cpu = -1; 3247 u32 tcpu; 3248 u32 hash; 3249 3250 if (skb_rx_queue_recorded(skb)) { 3251 u16 index = skb_get_rx_queue(skb); 3252 3253 if (unlikely(index >= dev->real_num_rx_queues)) { 3254 WARN_ONCE(dev->real_num_rx_queues > 1, 3255 "%s received packet on queue %u, but number " 3256 "of RX queues is %u\n", 3257 dev->name, index, dev->real_num_rx_queues); 3258 goto done; 3259 } 3260 rxqueue += index; 3261 } 3262 3263 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3264 3265 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3266 map = rcu_dereference(rxqueue->rps_map); 3267 if (!flow_table && !map) 3268 goto done; 3269 3270 skb_reset_network_header(skb); 3271 hash = skb_get_hash(skb); 3272 if (!hash) 3273 goto done; 3274 3275 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3276 if (flow_table && sock_flow_table) { 3277 struct rps_dev_flow *rflow; 3278 u32 next_cpu; 3279 u32 ident; 3280 3281 /* First check into global flow table if there is a match */ 3282 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3283 if ((ident ^ hash) & ~rps_cpu_mask) 3284 goto try_rps; 3285 3286 next_cpu = ident & rps_cpu_mask; 3287 3288 /* OK, now we know there is a match, 3289 * we can look at the local (per receive queue) flow table 3290 */ 3291 rflow = &flow_table->flows[hash & flow_table->mask]; 3292 tcpu = rflow->cpu; 3293 3294 /* 3295 * If the desired CPU (where last recvmsg was done) is 3296 * different from current CPU (one in the rx-queue flow 3297 * table entry), switch if one of the following holds: 3298 * - Current CPU is unset (>= nr_cpu_ids). 3299 * - Current CPU is offline. 3300 * - The current CPU's queue tail has advanced beyond the 3301 * last packet that was enqueued using this table entry. 3302 * This guarantees that all previous packets for the flow 3303 * have been dequeued, thus preserving in order delivery. 
3304 */ 3305 if (unlikely(tcpu != next_cpu) && 3306 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || 3307 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3308 rflow->last_qtail)) >= 0)) { 3309 tcpu = next_cpu; 3310 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3311 } 3312 3313 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { 3314 *rflowp = rflow; 3315 cpu = tcpu; 3316 goto done; 3317 } 3318 } 3319 3320 try_rps: 3321 3322 if (map) { 3323 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3324 if (cpu_online(tcpu)) { 3325 cpu = tcpu; 3326 goto done; 3327 } 3328 } 3329 3330 done: 3331 return cpu; 3332 } 3333 3334 #ifdef CONFIG_RFS_ACCEL 3335 3336 /** 3337 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3338 * @dev: Device on which the filter was set 3339 * @rxq_index: RX queue index 3340 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3341 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3342 * 3343 * Drivers that implement ndo_rx_flow_steer() should periodically call 3344 * this function for each installed filter and remove the filters for 3345 * which it returns %true. 3346 */ 3347 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3348 u32 flow_id, u16 filter_id) 3349 { 3350 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3351 struct rps_dev_flow_table *flow_table; 3352 struct rps_dev_flow *rflow; 3353 bool expire = true; 3354 unsigned int cpu; 3355 3356 rcu_read_lock(); 3357 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3358 if (flow_table && flow_id <= flow_table->mask) { 3359 rflow = &flow_table->flows[flow_id]; 3360 cpu = ACCESS_ONCE(rflow->cpu); 3361 if (rflow->filter == filter_id && cpu < nr_cpu_ids && 3362 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3363 rflow->last_qtail) < 3364 (int)(10 * flow_table->mask))) 3365 expire = false; 3366 } 3367 rcu_read_unlock(); 3368 return expire; 3369 } 3370 EXPORT_SYMBOL(rps_may_expire_flow); 3371 3372 #endif /* CONFIG_RFS_ACCEL */ 3373 3374 /* Called from hardirq (IPI) context */ 3375 static void rps_trigger_softirq(void *data) 3376 { 3377 struct softnet_data *sd = data; 3378 3379 ____napi_schedule(sd, &sd->backlog); 3380 sd->received_rps++; 3381 } 3382 3383 #endif /* CONFIG_RPS */ 3384 3385 /* 3386 * Check if this softnet_data structure is another cpu one 3387 * If yes, queue it to our IPI list and return 1 3388 * If no, return 0 3389 */ 3390 static int rps_ipi_queued(struct softnet_data *sd) 3391 { 3392 #ifdef CONFIG_RPS 3393 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3394 3395 if (sd != mysd) { 3396 sd->rps_ipi_next = mysd->rps_ipi_list; 3397 mysd->rps_ipi_list = sd; 3398 3399 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3400 return 1; 3401 } 3402 #endif /* CONFIG_RPS */ 3403 return 0; 3404 } 3405 3406 #ifdef CONFIG_NET_FLOW_LIMIT 3407 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3408 #endif 3409 3410 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3411 { 3412 #ifdef CONFIG_NET_FLOW_LIMIT 3413 struct sd_flow_limit *fl; 3414 struct softnet_data *sd; 3415 unsigned int old_flow, new_flow; 3416 3417 if (qlen < (netdev_max_backlog >> 1)) 3418 return false; 3419 3420 sd = this_cpu_ptr(&softnet_data); 3421 3422 rcu_read_lock(); 3423 fl = rcu_dereference(sd->flow_limit); 3424 if (fl) { 3425 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3426 old_flow = fl->history[fl->history_head]; 3427 fl->history[fl->history_head] = new_flow; 3428 3429 fl->history_head++; 3430 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3431 3432 if 
(likely(fl->buckets[old_flow])) 3433 fl->buckets[old_flow]--; 3434 3435 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3436 fl->count++; 3437 rcu_read_unlock(); 3438 return true; 3439 } 3440 } 3441 rcu_read_unlock(); 3442 #endif 3443 return false; 3444 } 3445 3446 /* 3447 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3448 * queue (may be a remote CPU queue). 3449 */ 3450 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3451 unsigned int *qtail) 3452 { 3453 struct softnet_data *sd; 3454 unsigned long flags; 3455 unsigned int qlen; 3456 3457 sd = &per_cpu(softnet_data, cpu); 3458 3459 local_irq_save(flags); 3460 3461 rps_lock(sd); 3462 qlen = skb_queue_len(&sd->input_pkt_queue); 3463 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3464 if (qlen) { 3465 enqueue: 3466 __skb_queue_tail(&sd->input_pkt_queue, skb); 3467 input_queue_tail_incr_save(sd, qtail); 3468 rps_unlock(sd); 3469 local_irq_restore(flags); 3470 return NET_RX_SUCCESS; 3471 } 3472 3473 /* Schedule NAPI for backlog device 3474 * We can use non atomic operation since we own the queue lock 3475 */ 3476 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3477 if (!rps_ipi_queued(sd)) 3478 ____napi_schedule(sd, &sd->backlog); 3479 } 3480 goto enqueue; 3481 } 3482 3483 sd->dropped++; 3484 rps_unlock(sd); 3485 3486 local_irq_restore(flags); 3487 3488 atomic_long_inc(&skb->dev->rx_dropped); 3489 kfree_skb(skb); 3490 return NET_RX_DROP; 3491 } 3492 3493 static int netif_rx_internal(struct sk_buff *skb) 3494 { 3495 int ret; 3496 3497 net_timestamp_check(netdev_tstamp_prequeue, skb); 3498 3499 trace_netif_rx(skb); 3500 #ifdef CONFIG_RPS 3501 if (static_key_false(&rps_needed)) { 3502 struct rps_dev_flow voidflow, *rflow = &voidflow; 3503 int cpu; 3504 3505 preempt_disable(); 3506 rcu_read_lock(); 3507 3508 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3509 if (cpu < 0) 3510 cpu = smp_processor_id(); 3511 3512 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3513 3514 rcu_read_unlock(); 3515 preempt_enable(); 3516 } else 3517 #endif 3518 { 3519 unsigned int qtail; 3520 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3521 put_cpu(); 3522 } 3523 return ret; 3524 } 3525 3526 /** 3527 * netif_rx - post buffer to the network code 3528 * @skb: buffer to post 3529 * 3530 * This function receives a packet from a device driver and queues it for 3531 * the upper (protocol) levels to process. It always succeeds. The buffer 3532 * may be dropped during processing for congestion control or by the 3533 * protocol layers. 
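 *
 * A minimal usage sketch (hypothetical driver code, not taken from this
 * file): a non-NAPI driver's receive interrupt handler, after copying a
 * frame into an skb, would typically do
 *
 *      skb->protocol = eth_type_trans(skb, netdev);
 *      netif_rx(skb);
 *
 * where "netdev" is the driver's net_device. From process context,
 * netif_rx_ni() (defined below) should be used instead so that pending
 * softirqs get a chance to run.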
3534 * 3535 * return values: 3536 * NET_RX_SUCCESS (no congestion) 3537 * NET_RX_DROP (packet was dropped) 3538 * 3539 */ 3540 3541 int netif_rx(struct sk_buff *skb) 3542 { 3543 trace_netif_rx_entry(skb); 3544 3545 return netif_rx_internal(skb); 3546 } 3547 EXPORT_SYMBOL(netif_rx); 3548 3549 int netif_rx_ni(struct sk_buff *skb) 3550 { 3551 int err; 3552 3553 trace_netif_rx_ni_entry(skb); 3554 3555 preempt_disable(); 3556 err = netif_rx_internal(skb); 3557 if (local_softirq_pending()) 3558 do_softirq(); 3559 preempt_enable(); 3560 3561 return err; 3562 } 3563 EXPORT_SYMBOL(netif_rx_ni); 3564 3565 static void net_tx_action(struct softirq_action *h) 3566 { 3567 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3568 3569 if (sd->completion_queue) { 3570 struct sk_buff *clist; 3571 3572 local_irq_disable(); 3573 clist = sd->completion_queue; 3574 sd->completion_queue = NULL; 3575 local_irq_enable(); 3576 3577 while (clist) { 3578 struct sk_buff *skb = clist; 3579 clist = clist->next; 3580 3581 WARN_ON(atomic_read(&skb->users)); 3582 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3583 trace_consume_skb(skb); 3584 else 3585 trace_kfree_skb(skb, net_tx_action); 3586 __kfree_skb(skb); 3587 } 3588 } 3589 3590 if (sd->output_queue) { 3591 struct Qdisc *head; 3592 3593 local_irq_disable(); 3594 head = sd->output_queue; 3595 sd->output_queue = NULL; 3596 sd->output_queue_tailp = &sd->output_queue; 3597 local_irq_enable(); 3598 3599 while (head) { 3600 struct Qdisc *q = head; 3601 spinlock_t *root_lock; 3602 3603 head = head->next_sched; 3604 3605 root_lock = qdisc_lock(q); 3606 if (spin_trylock(root_lock)) { 3607 smp_mb__before_atomic(); 3608 clear_bit(__QDISC_STATE_SCHED, 3609 &q->state); 3610 qdisc_run(q); 3611 spin_unlock(root_lock); 3612 } else { 3613 if (!test_bit(__QDISC_STATE_DEACTIVATED, 3614 &q->state)) { 3615 __netif_reschedule(q); 3616 } else { 3617 smp_mb__before_atomic(); 3618 clear_bit(__QDISC_STATE_SCHED, 3619 &q->state); 3620 } 3621 } 3622 } 3623 } 3624 } 3625 3626 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 3627 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 3628 /* This hook is defined here for ATM LANE */ 3629 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3630 unsigned char *addr) __read_mostly; 3631 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3632 #endif 3633 3634 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 3635 struct packet_type **pt_prev, 3636 int *ret, struct net_device *orig_dev) 3637 { 3638 #ifdef CONFIG_NET_CLS_ACT 3639 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); 3640 struct tcf_result cl_res; 3641 3642 /* If there's at least one ingress present somewhere (so 3643 * we get here via enabled static key), remaining devices 3644 * that are not configured with an ingress qdisc will bail 3645 * out here. 
3646 */ 3647 if (!cl) 3648 return skb; 3649 if (*pt_prev) { 3650 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3651 *pt_prev = NULL; 3652 } 3653 3654 qdisc_skb_cb(skb)->pkt_len = skb->len; 3655 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3656 qdisc_bstats_update_cpu(cl->q, skb); 3657 3658 switch (tc_classify(skb, cl, &cl_res)) { 3659 case TC_ACT_OK: 3660 case TC_ACT_RECLASSIFY: 3661 skb->tc_index = TC_H_MIN(cl_res.classid); 3662 break; 3663 case TC_ACT_SHOT: 3664 qdisc_qstats_drop_cpu(cl->q); 3665 case TC_ACT_STOLEN: 3666 case TC_ACT_QUEUED: 3667 kfree_skb(skb); 3668 return NULL; 3669 default: 3670 break; 3671 } 3672 #endif /* CONFIG_NET_CLS_ACT */ 3673 return skb; 3674 } 3675 3676 /** 3677 * netdev_rx_handler_register - register receive handler 3678 * @dev: device to register a handler for 3679 * @rx_handler: receive handler to register 3680 * @rx_handler_data: data pointer that is used by rx handler 3681 * 3682 * Register a receive handler for a device. This handler will then be 3683 * called from __netif_receive_skb. A negative errno code is returned 3684 * on a failure. 3685 * 3686 * The caller must hold the rtnl_mutex. 3687 * 3688 * For a general description of rx_handler, see enum rx_handler_result. 3689 */ 3690 int netdev_rx_handler_register(struct net_device *dev, 3691 rx_handler_func_t *rx_handler, 3692 void *rx_handler_data) 3693 { 3694 ASSERT_RTNL(); 3695 3696 if (dev->rx_handler) 3697 return -EBUSY; 3698 3699 /* Note: rx_handler_data must be set before rx_handler */ 3700 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 3701 rcu_assign_pointer(dev->rx_handler, rx_handler); 3702 3703 return 0; 3704 } 3705 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 3706 3707 /** 3708 * netdev_rx_handler_unregister - unregister receive handler 3709 * @dev: device to unregister a handler from 3710 * 3711 * Unregister a receive handler from a device. 3712 * 3713 * The caller must hold the rtnl_mutex. 3714 */ 3715 void netdev_rx_handler_unregister(struct net_device *dev) 3716 { 3717 3718 ASSERT_RTNL(); 3719 RCU_INIT_POINTER(dev->rx_handler, NULL); 3720 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 3721 * section has a guarantee to see a non NULL rx_handler_data 3722 * as well. 3723 */ 3724 synchronize_net(); 3725 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 3726 } 3727 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3728 3729 /* 3730 * Limit the use of PFMEMALLOC reserves to those protocols that implement 3731 * the special handling of PFMEMALLOC skbs. 
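 * Currently that means ARP, IPv4, IPv6 and the 802.1Q/802.1AD VLAN
 * ethertypes, i.e. exactly the cases enumerated in the switch statement
 * below.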
3732 */ 3733 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 3734 { 3735 switch (skb->protocol) { 3736 case htons(ETH_P_ARP): 3737 case htons(ETH_P_IP): 3738 case htons(ETH_P_IPV6): 3739 case htons(ETH_P_8021Q): 3740 case htons(ETH_P_8021AD): 3741 return true; 3742 default: 3743 return false; 3744 } 3745 } 3746 3747 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, 3748 int *ret, struct net_device *orig_dev) 3749 { 3750 #ifdef CONFIG_NETFILTER_INGRESS 3751 if (nf_hook_ingress_active(skb)) { 3752 if (*pt_prev) { 3753 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3754 *pt_prev = NULL; 3755 } 3756 3757 return nf_hook_ingress(skb); 3758 } 3759 #endif /* CONFIG_NETFILTER_INGRESS */ 3760 return 0; 3761 } 3762 3763 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 3764 { 3765 struct packet_type *ptype, *pt_prev; 3766 rx_handler_func_t *rx_handler; 3767 struct net_device *orig_dev; 3768 bool deliver_exact = false; 3769 int ret = NET_RX_DROP; 3770 __be16 type; 3771 3772 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3773 3774 trace_netif_receive_skb(skb); 3775 3776 orig_dev = skb->dev; 3777 3778 skb_reset_network_header(skb); 3779 if (!skb_transport_header_was_set(skb)) 3780 skb_reset_transport_header(skb); 3781 skb_reset_mac_len(skb); 3782 3783 pt_prev = NULL; 3784 3785 rcu_read_lock(); 3786 3787 another_round: 3788 skb->skb_iif = skb->dev->ifindex; 3789 3790 __this_cpu_inc(softnet_data.processed); 3791 3792 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 3793 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 3794 skb = skb_vlan_untag(skb); 3795 if (unlikely(!skb)) 3796 goto unlock; 3797 } 3798 3799 #ifdef CONFIG_NET_CLS_ACT 3800 if (skb->tc_verd & TC_NCLS) { 3801 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 3802 goto ncls; 3803 } 3804 #endif 3805 3806 if (pfmemalloc) 3807 goto skip_taps; 3808 3809 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3810 if (pt_prev) 3811 ret = deliver_skb(skb, pt_prev, orig_dev); 3812 pt_prev = ptype; 3813 } 3814 3815 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 3816 if (pt_prev) 3817 ret = deliver_skb(skb, pt_prev, orig_dev); 3818 pt_prev = ptype; 3819 } 3820 3821 skip_taps: 3822 #ifdef CONFIG_NET_INGRESS 3823 if (static_key_false(&ingress_needed)) { 3824 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 3825 if (!skb) 3826 goto unlock; 3827 3828 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) 3829 goto unlock; 3830 } 3831 #endif 3832 #ifdef CONFIG_NET_CLS_ACT 3833 skb->tc_verd = 0; 3834 ncls: 3835 #endif 3836 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 3837 goto drop; 3838 3839 if (skb_vlan_tag_present(skb)) { 3840 if (pt_prev) { 3841 ret = deliver_skb(skb, pt_prev, orig_dev); 3842 pt_prev = NULL; 3843 } 3844 if (vlan_do_receive(&skb)) 3845 goto another_round; 3846 else if (unlikely(!skb)) 3847 goto unlock; 3848 } 3849 3850 rx_handler = rcu_dereference(skb->dev->rx_handler); 3851 if (rx_handler) { 3852 if (pt_prev) { 3853 ret = deliver_skb(skb, pt_prev, orig_dev); 3854 pt_prev = NULL; 3855 } 3856 switch (rx_handler(&skb)) { 3857 case RX_HANDLER_CONSUMED: 3858 ret = NET_RX_SUCCESS; 3859 goto unlock; 3860 case RX_HANDLER_ANOTHER: 3861 goto another_round; 3862 case RX_HANDLER_EXACT: 3863 deliver_exact = true; 3864 case RX_HANDLER_PASS: 3865 break; 3866 default: 3867 BUG(); 3868 } 3869 } 3870 3871 if (unlikely(skb_vlan_tag_present(skb))) { 3872 if (skb_vlan_tag_get_id(skb)) 3873 skb->pkt_type = PACKET_OTHERHOST; 3874 /* Note: we might in the future use prio bits 3875 * and set 
skb->priority like in vlan_do_receive() 3876 * For the time being, just ignore Priority Code Point 3877 */ 3878 skb->vlan_tci = 0; 3879 } 3880 3881 type = skb->protocol; 3882 3883 /* deliver only exact match when indicated */ 3884 if (likely(!deliver_exact)) { 3885 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3886 &ptype_base[ntohs(type) & 3887 PTYPE_HASH_MASK]); 3888 } 3889 3890 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3891 &orig_dev->ptype_specific); 3892 3893 if (unlikely(skb->dev != orig_dev)) { 3894 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3895 &skb->dev->ptype_specific); 3896 } 3897 3898 if (pt_prev) { 3899 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 3900 goto drop; 3901 else 3902 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3903 } else { 3904 drop: 3905 atomic_long_inc(&skb->dev->rx_dropped); 3906 kfree_skb(skb); 3907 /* Jamal, now you will not able to escape explaining 3908 * me how you were going to use this. :-) 3909 */ 3910 ret = NET_RX_DROP; 3911 } 3912 3913 unlock: 3914 rcu_read_unlock(); 3915 return ret; 3916 } 3917 3918 static int __netif_receive_skb(struct sk_buff *skb) 3919 { 3920 int ret; 3921 3922 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 3923 unsigned long pflags = current->flags; 3924 3925 /* 3926 * PFMEMALLOC skbs are special, they should 3927 * - be delivered to SOCK_MEMALLOC sockets only 3928 * - stay away from userspace 3929 * - have bounded memory usage 3930 * 3931 * Use PF_MEMALLOC as this saves us from propagating the allocation 3932 * context down to all allocation sites. 3933 */ 3934 current->flags |= PF_MEMALLOC; 3935 ret = __netif_receive_skb_core(skb, true); 3936 tsk_restore_flags(current, pflags, PF_MEMALLOC); 3937 } else 3938 ret = __netif_receive_skb_core(skb, false); 3939 3940 return ret; 3941 } 3942 3943 static int netif_receive_skb_internal(struct sk_buff *skb) 3944 { 3945 net_timestamp_check(netdev_tstamp_prequeue, skb); 3946 3947 if (skb_defer_rx_timestamp(skb)) 3948 return NET_RX_SUCCESS; 3949 3950 #ifdef CONFIG_RPS 3951 if (static_key_false(&rps_needed)) { 3952 struct rps_dev_flow voidflow, *rflow = &voidflow; 3953 int cpu, ret; 3954 3955 rcu_read_lock(); 3956 3957 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3958 3959 if (cpu >= 0) { 3960 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3961 rcu_read_unlock(); 3962 return ret; 3963 } 3964 rcu_read_unlock(); 3965 } 3966 #endif 3967 return __netif_receive_skb(skb); 3968 } 3969 3970 /** 3971 * netif_receive_skb - process receive buffer from network 3972 * @skb: buffer to process 3973 * 3974 * netif_receive_skb() is the main receive data processing function. 3975 * It always succeeds. The buffer may be dropped during processing 3976 * for congestion control or by the protocol layers. 3977 * 3978 * This function may only be called from softirq context and interrupts 3979 * should be enabled. 3980 * 3981 * Return values (usually ignored): 3982 * NET_RX_SUCCESS: no congestion 3983 * NET_RX_DROP: packet was dropped 3984 */ 3985 int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb) 3986 { 3987 trace_netif_receive_skb_entry(skb); 3988 3989 return netif_receive_skb_internal(skb); 3990 } 3991 EXPORT_SYMBOL(netif_receive_skb_sk); 3992 3993 /* Network device is going away, flush any packets still pending 3994 * Called with irqs disabled. 
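 * The softnet_data accessed below is per-CPU state, so this is expected to
 * be invoked on every CPU (e.g. via an on_each_cpu()-style call) during
 * device teardown; each CPU then drops the pending skbs that reference the
 * departing device from its own input_pkt_queue and process_queue.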
3995 */ 3996 static void flush_backlog(void *arg) 3997 { 3998 struct net_device *dev = arg; 3999 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4000 struct sk_buff *skb, *tmp; 4001 4002 rps_lock(sd); 4003 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4004 if (skb->dev == dev) { 4005 __skb_unlink(skb, &sd->input_pkt_queue); 4006 kfree_skb(skb); 4007 input_queue_head_incr(sd); 4008 } 4009 } 4010 rps_unlock(sd); 4011 4012 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4013 if (skb->dev == dev) { 4014 __skb_unlink(skb, &sd->process_queue); 4015 kfree_skb(skb); 4016 input_queue_head_incr(sd); 4017 } 4018 } 4019 } 4020 4021 static int napi_gro_complete(struct sk_buff *skb) 4022 { 4023 struct packet_offload *ptype; 4024 __be16 type = skb->protocol; 4025 struct list_head *head = &offload_base; 4026 int err = -ENOENT; 4027 4028 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 4029 4030 if (NAPI_GRO_CB(skb)->count == 1) { 4031 skb_shinfo(skb)->gso_size = 0; 4032 goto out; 4033 } 4034 4035 rcu_read_lock(); 4036 list_for_each_entry_rcu(ptype, head, list) { 4037 if (ptype->type != type || !ptype->callbacks.gro_complete) 4038 continue; 4039 4040 err = ptype->callbacks.gro_complete(skb, 0); 4041 break; 4042 } 4043 rcu_read_unlock(); 4044 4045 if (err) { 4046 WARN_ON(&ptype->list == head); 4047 kfree_skb(skb); 4048 return NET_RX_SUCCESS; 4049 } 4050 4051 out: 4052 return netif_receive_skb_internal(skb); 4053 } 4054 4055 /* napi->gro_list contains packets ordered by age. 4056 * youngest packets at the head of it. 4057 * Complete skbs in reverse order to reduce latencies. 4058 */ 4059 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 4060 { 4061 struct sk_buff *skb, *prev = NULL; 4062 4063 /* scan list and build reverse chain */ 4064 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 4065 skb->prev = prev; 4066 prev = skb; 4067 } 4068 4069 for (skb = prev; skb; skb = prev) { 4070 skb->next = NULL; 4071 4072 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 4073 return; 4074 4075 prev = skb->prev; 4076 napi_gro_complete(skb); 4077 napi->gro_count--; 4078 } 4079 4080 napi->gro_list = NULL; 4081 } 4082 EXPORT_SYMBOL(napi_gro_flush); 4083 4084 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 4085 { 4086 struct sk_buff *p; 4087 unsigned int maclen = skb->dev->hard_header_len; 4088 u32 hash = skb_get_hash_raw(skb); 4089 4090 for (p = napi->gro_list; p; p = p->next) { 4091 unsigned long diffs; 4092 4093 NAPI_GRO_CB(p)->flush = 0; 4094 4095 if (hash != skb_get_hash_raw(p)) { 4096 NAPI_GRO_CB(p)->same_flow = 0; 4097 continue; 4098 } 4099 4100 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 4101 diffs |= p->vlan_tci ^ skb->vlan_tci; 4102 if (maclen == ETH_HLEN) 4103 diffs |= compare_ether_header(skb_mac_header(p), 4104 skb_mac_header(skb)); 4105 else if (!diffs) 4106 diffs = memcmp(skb_mac_header(p), 4107 skb_mac_header(skb), 4108 maclen); 4109 NAPI_GRO_CB(p)->same_flow = !diffs; 4110 } 4111 } 4112 4113 static void skb_gro_reset_offset(struct sk_buff *skb) 4114 { 4115 const struct skb_shared_info *pinfo = skb_shinfo(skb); 4116 const skb_frag_t *frag0 = &pinfo->frags[0]; 4117 4118 NAPI_GRO_CB(skb)->data_offset = 0; 4119 NAPI_GRO_CB(skb)->frag0 = NULL; 4120 NAPI_GRO_CB(skb)->frag0_len = 0; 4121 4122 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 4123 pinfo->nr_frags && 4124 !PageHighMem(skb_frag_page(frag0))) { 4125 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4126 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 4127 
} 4128 } 4129 4130 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 4131 { 4132 struct skb_shared_info *pinfo = skb_shinfo(skb); 4133 4134 BUG_ON(skb->end - skb->tail < grow); 4135 4136 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4137 4138 skb->data_len -= grow; 4139 skb->tail += grow; 4140 4141 pinfo->frags[0].page_offset += grow; 4142 skb_frag_size_sub(&pinfo->frags[0], grow); 4143 4144 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4145 skb_frag_unref(skb, 0); 4146 memmove(pinfo->frags, pinfo->frags + 1, 4147 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4148 } 4149 } 4150 4151 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4152 { 4153 struct sk_buff **pp = NULL; 4154 struct packet_offload *ptype; 4155 __be16 type = skb->protocol; 4156 struct list_head *head = &offload_base; 4157 int same_flow; 4158 enum gro_result ret; 4159 int grow; 4160 4161 if (!(skb->dev->features & NETIF_F_GRO)) 4162 goto normal; 4163 4164 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4165 goto normal; 4166 4167 gro_list_prepare(napi, skb); 4168 4169 rcu_read_lock(); 4170 list_for_each_entry_rcu(ptype, head, list) { 4171 if (ptype->type != type || !ptype->callbacks.gro_receive) 4172 continue; 4173 4174 skb_set_network_header(skb, skb_gro_offset(skb)); 4175 skb_reset_mac_len(skb); 4176 NAPI_GRO_CB(skb)->same_flow = 0; 4177 NAPI_GRO_CB(skb)->flush = 0; 4178 NAPI_GRO_CB(skb)->free = 0; 4179 NAPI_GRO_CB(skb)->udp_mark = 0; 4180 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4181 4182 /* Setup for GRO checksum validation */ 4183 switch (skb->ip_summed) { 4184 case CHECKSUM_COMPLETE: 4185 NAPI_GRO_CB(skb)->csum = skb->csum; 4186 NAPI_GRO_CB(skb)->csum_valid = 1; 4187 NAPI_GRO_CB(skb)->csum_cnt = 0; 4188 break; 4189 case CHECKSUM_UNNECESSARY: 4190 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4191 NAPI_GRO_CB(skb)->csum_valid = 0; 4192 break; 4193 default: 4194 NAPI_GRO_CB(skb)->csum_cnt = 0; 4195 NAPI_GRO_CB(skb)->csum_valid = 0; 4196 } 4197 4198 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4199 break; 4200 } 4201 rcu_read_unlock(); 4202 4203 if (&ptype->list == head) 4204 goto normal; 4205 4206 same_flow = NAPI_GRO_CB(skb)->same_flow; 4207 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; 4208 4209 if (pp) { 4210 struct sk_buff *nskb = *pp; 4211 4212 *pp = nskb->next; 4213 nskb->next = NULL; 4214 napi_gro_complete(nskb); 4215 napi->gro_count--; 4216 } 4217 4218 if (same_flow) 4219 goto ok; 4220 4221 if (NAPI_GRO_CB(skb)->flush) 4222 goto normal; 4223 4224 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4225 struct sk_buff *nskb = napi->gro_list; 4226 4227 /* locate the end of the list to select the 'oldest' flow */ 4228 while (nskb->next) { 4229 pp = &nskb->next; 4230 nskb = *pp; 4231 } 4232 *pp = NULL; 4233 nskb->next = NULL; 4234 napi_gro_complete(nskb); 4235 } else { 4236 napi->gro_count++; 4237 } 4238 NAPI_GRO_CB(skb)->count = 1; 4239 NAPI_GRO_CB(skb)->age = jiffies; 4240 NAPI_GRO_CB(skb)->last = skb; 4241 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4242 skb->next = napi->gro_list; 4243 napi->gro_list = skb; 4244 ret = GRO_HELD; 4245 4246 pull: 4247 grow = skb_gro_offset(skb) - skb_headlen(skb); 4248 if (grow > 0) 4249 gro_pull_from_frag0(skb, grow); 4250 ok: 4251 return ret; 4252 4253 normal: 4254 ret = GRO_NORMAL; 4255 goto pull; 4256 } 4257 4258 struct packet_offload *gro_find_receive_by_type(__be16 type) 4259 { 4260 struct list_head *offload_head = &offload_base; 4261 struct packet_offload *ptype; 4262 4263 list_for_each_entry_rcu(ptype, offload_head, list) { 4264 if (ptype->type != type || !ptype->callbacks.gro_receive) 4265 continue; 4266 return ptype; 4267 } 4268 return NULL; 4269 } 4270 EXPORT_SYMBOL(gro_find_receive_by_type); 4271 4272 struct packet_offload *gro_find_complete_by_type(__be16 type) 4273 { 4274 struct list_head *offload_head = &offload_base; 4275 struct packet_offload *ptype; 4276 4277 list_for_each_entry_rcu(ptype, offload_head, list) { 4278 if (ptype->type != type || !ptype->callbacks.gro_complete) 4279 continue; 4280 return ptype; 4281 } 4282 return NULL; 4283 } 4284 EXPORT_SYMBOL(gro_find_complete_by_type); 4285 4286 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4287 { 4288 switch (ret) { 4289 case GRO_NORMAL: 4290 if (netif_receive_skb_internal(skb)) 4291 ret = GRO_DROP; 4292 break; 4293 4294 case GRO_DROP: 4295 kfree_skb(skb); 4296 break; 4297 4298 case GRO_MERGED_FREE: 4299 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) 4300 kmem_cache_free(skbuff_head_cache, skb); 4301 else 4302 __kfree_skb(skb); 4303 break; 4304 4305 case GRO_HELD: 4306 case GRO_MERGED: 4307 break; 4308 } 4309 4310 return ret; 4311 } 4312 4313 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4314 { 4315 trace_napi_gro_receive_entry(skb); 4316 4317 skb_gro_reset_offset(skb); 4318 4319 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4320 } 4321 EXPORT_SYMBOL(napi_gro_receive); 4322 4323 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4324 { 4325 if (unlikely(skb->pfmemalloc)) { 4326 consume_skb(skb); 4327 return; 4328 } 4329 __skb_pull(skb, skb_headlen(skb)); 4330 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4331 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4332 skb->vlan_tci = 0; 4333 skb->dev = napi->dev; 4334 skb->skb_iif = 0; 4335 skb->encapsulation = 0; 4336 skb_shinfo(skb)->gso_type = 0; 4337 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4338 4339 napi->skb = skb; 4340 } 4341 4342 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4343 { 4344 struct sk_buff *skb = napi->skb; 4345 4346 if (!skb) { 4347 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4348 napi->skb = skb; 4349 } 4350 return 
skb; 4351 } 4352 EXPORT_SYMBOL(napi_get_frags); 4353 4354 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4355 struct sk_buff *skb, 4356 gro_result_t ret) 4357 { 4358 switch (ret) { 4359 case GRO_NORMAL: 4360 case GRO_HELD: 4361 __skb_push(skb, ETH_HLEN); 4362 skb->protocol = eth_type_trans(skb, skb->dev); 4363 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4364 ret = GRO_DROP; 4365 break; 4366 4367 case GRO_DROP: 4368 case GRO_MERGED_FREE: 4369 napi_reuse_skb(napi, skb); 4370 break; 4371 4372 case GRO_MERGED: 4373 break; 4374 } 4375 4376 return ret; 4377 } 4378 4379 /* Upper GRO stack assumes network header starts at gro_offset=0 4380 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4381 * We copy ethernet header into skb->data to have a common layout. 4382 */ 4383 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4384 { 4385 struct sk_buff *skb = napi->skb; 4386 const struct ethhdr *eth; 4387 unsigned int hlen = sizeof(*eth); 4388 4389 napi->skb = NULL; 4390 4391 skb_reset_mac_header(skb); 4392 skb_gro_reset_offset(skb); 4393 4394 eth = skb_gro_header_fast(skb, 0); 4395 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4396 eth = skb_gro_header_slow(skb, hlen, 0); 4397 if (unlikely(!eth)) { 4398 napi_reuse_skb(napi, skb); 4399 return NULL; 4400 } 4401 } else { 4402 gro_pull_from_frag0(skb, hlen); 4403 NAPI_GRO_CB(skb)->frag0 += hlen; 4404 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4405 } 4406 __skb_pull(skb, hlen); 4407 4408 /* 4409 * This works because the only protocols we care about don't require 4410 * special handling. 4411 * We'll fix it up properly in napi_frags_finish() 4412 */ 4413 skb->protocol = eth->h_proto; 4414 4415 return skb; 4416 } 4417 4418 gro_result_t napi_gro_frags(struct napi_struct *napi) 4419 { 4420 struct sk_buff *skb = napi_frags_skb(napi); 4421 4422 if (!skb) 4423 return GRO_DROP; 4424 4425 trace_napi_gro_frags_entry(skb); 4426 4427 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4428 } 4429 EXPORT_SYMBOL(napi_gro_frags); 4430 4431 /* Compute the checksum from gro_offset and return the folded value 4432 * after adding in any pseudo checksum. 4433 */ 4434 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4435 { 4436 __wsum wsum; 4437 __sum16 sum; 4438 4439 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4440 4441 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4442 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4443 if (likely(!sum)) { 4444 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4445 !skb->csum_complete_sw) 4446 netdev_rx_csum_fault(skb->dev); 4447 } 4448 4449 NAPI_GRO_CB(skb)->csum = wsum; 4450 NAPI_GRO_CB(skb)->csum_valid = 1; 4451 4452 return sum; 4453 } 4454 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4455 4456 /* 4457 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4458 * Note: called with local irq disabled, but exits with local irq enabled. 4459 */ 4460 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4461 { 4462 #ifdef CONFIG_RPS 4463 struct softnet_data *remsd = sd->rps_ipi_list; 4464 4465 if (remsd) { 4466 sd->rps_ipi_list = NULL; 4467 4468 local_irq_enable(); 4469 4470 /* Send pending IPI's to kick RPS processing on remote cpus. 
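 * Each csd below is assumed to be the per-cpu softnet_data call_single_data
 * set up at init time to run rps_trigger_softirq() on the target CPU, which
 * schedules that CPU's backlog NAPI and raises NET_RX_SOFTIRQ there.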
*/ 4471 while (remsd) { 4472 struct softnet_data *next = remsd->rps_ipi_next; 4473 4474 if (cpu_online(remsd->cpu)) 4475 smp_call_function_single_async(remsd->cpu, 4476 &remsd->csd); 4477 remsd = next; 4478 } 4479 } else 4480 #endif 4481 local_irq_enable(); 4482 } 4483 4484 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4485 { 4486 #ifdef CONFIG_RPS 4487 return sd->rps_ipi_list != NULL; 4488 #else 4489 return false; 4490 #endif 4491 } 4492 4493 static int process_backlog(struct napi_struct *napi, int quota) 4494 { 4495 int work = 0; 4496 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4497 4498 /* Check if we have pending ipi, its better to send them now, 4499 * not waiting net_rx_action() end. 4500 */ 4501 if (sd_has_rps_ipi_waiting(sd)) { 4502 local_irq_disable(); 4503 net_rps_action_and_irq_enable(sd); 4504 } 4505 4506 napi->weight = weight_p; 4507 local_irq_disable(); 4508 while (1) { 4509 struct sk_buff *skb; 4510 4511 while ((skb = __skb_dequeue(&sd->process_queue))) { 4512 local_irq_enable(); 4513 __netif_receive_skb(skb); 4514 local_irq_disable(); 4515 input_queue_head_incr(sd); 4516 if (++work >= quota) { 4517 local_irq_enable(); 4518 return work; 4519 } 4520 } 4521 4522 rps_lock(sd); 4523 if (skb_queue_empty(&sd->input_pkt_queue)) { 4524 /* 4525 * Inline a custom version of __napi_complete(). 4526 * only current cpu owns and manipulates this napi, 4527 * and NAPI_STATE_SCHED is the only possible flag set 4528 * on backlog. 4529 * We can use a plain write instead of clear_bit(), 4530 * and we dont need an smp_mb() memory barrier. 4531 */ 4532 napi->state = 0; 4533 rps_unlock(sd); 4534 4535 break; 4536 } 4537 4538 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4539 &sd->process_queue); 4540 rps_unlock(sd); 4541 } 4542 local_irq_enable(); 4543 4544 return work; 4545 } 4546 4547 /** 4548 * __napi_schedule - schedule for receive 4549 * @n: entry to schedule 4550 * 4551 * The entry's receive function will be scheduled to run. 4552 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
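 *
 * Illustrative driver usage (a sketch only; the foo_* names are hypothetical):
 *
 *	static irqreturn_t foo_isr(int irq, void *data)
 *	{
 *		struct foo_priv *priv = data;
 *
 *		foo_mask_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 *
 * The matching ->poll() callback then re-enables the device interrupt once it
 * has called napi_complete_done().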
4553 */ 4554 void __napi_schedule(struct napi_struct *n) 4555 { 4556 unsigned long flags; 4557 4558 local_irq_save(flags); 4559 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4560 local_irq_restore(flags); 4561 } 4562 EXPORT_SYMBOL(__napi_schedule); 4563 4564 /** 4565 * __napi_schedule_irqoff - schedule for receive 4566 * @n: entry to schedule 4567 * 4568 * Variant of __napi_schedule() assuming hard irqs are masked 4569 */ 4570 void __napi_schedule_irqoff(struct napi_struct *n) 4571 { 4572 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4573 } 4574 EXPORT_SYMBOL(__napi_schedule_irqoff); 4575 4576 void __napi_complete(struct napi_struct *n) 4577 { 4578 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4579 4580 list_del_init(&n->poll_list); 4581 smp_mb__before_atomic(); 4582 clear_bit(NAPI_STATE_SCHED, &n->state); 4583 } 4584 EXPORT_SYMBOL(__napi_complete); 4585 4586 void napi_complete_done(struct napi_struct *n, int work_done) 4587 { 4588 unsigned long flags; 4589 4590 /* 4591 * don't let napi dequeue from the cpu poll list 4592 * just in case its running on a different cpu 4593 */ 4594 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4595 return; 4596 4597 if (n->gro_list) { 4598 unsigned long timeout = 0; 4599 4600 if (work_done) 4601 timeout = n->dev->gro_flush_timeout; 4602 4603 if (timeout) 4604 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4605 HRTIMER_MODE_REL_PINNED); 4606 else 4607 napi_gro_flush(n, false); 4608 } 4609 if (likely(list_empty(&n->poll_list))) { 4610 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4611 } else { 4612 /* If n->poll_list is not empty, we need to mask irqs */ 4613 local_irq_save(flags); 4614 __napi_complete(n); 4615 local_irq_restore(flags); 4616 } 4617 } 4618 EXPORT_SYMBOL(napi_complete_done); 4619 4620 /* must be called under rcu_read_lock(), as we dont take a reference */ 4621 struct napi_struct *napi_by_id(unsigned int napi_id) 4622 { 4623 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4624 struct napi_struct *napi; 4625 4626 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4627 if (napi->napi_id == napi_id) 4628 return napi; 4629 4630 return NULL; 4631 } 4632 EXPORT_SYMBOL_GPL(napi_by_id); 4633 4634 void napi_hash_add(struct napi_struct *napi) 4635 { 4636 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { 4637 4638 spin_lock(&napi_hash_lock); 4639 4640 /* 0 is not a valid id, we also skip an id that is taken 4641 * we expect both events to be extremely rare 4642 */ 4643 napi->napi_id = 0; 4644 while (!napi->napi_id) { 4645 napi->napi_id = ++napi_gen_id; 4646 if (napi_by_id(napi->napi_id)) 4647 napi->napi_id = 0; 4648 } 4649 4650 hlist_add_head_rcu(&napi->napi_hash_node, 4651 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 4652 4653 spin_unlock(&napi_hash_lock); 4654 } 4655 } 4656 EXPORT_SYMBOL_GPL(napi_hash_add); 4657 4658 /* Warning : caller is responsible to make sure rcu grace period 4659 * is respected before freeing memory containing @napi 4660 */ 4661 void napi_hash_del(struct napi_struct *napi) 4662 { 4663 spin_lock(&napi_hash_lock); 4664 4665 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) 4666 hlist_del_rcu(&napi->napi_hash_node); 4667 4668 spin_unlock(&napi_hash_lock); 4669 } 4670 EXPORT_SYMBOL_GPL(napi_hash_del); 4671 4672 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 4673 { 4674 struct napi_struct *napi; 4675 4676 napi = container_of(timer, struct napi_struct, timer); 4677 if (napi->gro_list) 4678 napi_schedule(napi); 4679 4680 return HRTIMER_NORESTART; 
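	/* This watchdog is armed by napi_complete_done() when
	 * dev->gro_flush_timeout is non-zero; if packets are still sitting on
	 * gro_list when it fires, rescheduling the NAPI instance lets the next
	 * poll flush them. The timeout is presumably tuned per device, e.g.
	 * (illustrative):
	 *
	 *	echo 20000 > /sys/class/net/eth0/gro_flush_timeout
	 */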
4681 } 4682 4683 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 4684 int (*poll)(struct napi_struct *, int), int weight) 4685 { 4686 INIT_LIST_HEAD(&napi->poll_list); 4687 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 4688 napi->timer.function = napi_watchdog; 4689 napi->gro_count = 0; 4690 napi->gro_list = NULL; 4691 napi->skb = NULL; 4692 napi->poll = poll; 4693 if (weight > NAPI_POLL_WEIGHT) 4694 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 4695 weight, dev->name); 4696 napi->weight = weight; 4697 list_add(&napi->dev_list, &dev->napi_list); 4698 napi->dev = dev; 4699 #ifdef CONFIG_NETPOLL 4700 spin_lock_init(&napi->poll_lock); 4701 napi->poll_owner = -1; 4702 #endif 4703 set_bit(NAPI_STATE_SCHED, &napi->state); 4704 } 4705 EXPORT_SYMBOL(netif_napi_add); 4706 4707 void napi_disable(struct napi_struct *n) 4708 { 4709 might_sleep(); 4710 set_bit(NAPI_STATE_DISABLE, &n->state); 4711 4712 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 4713 msleep(1); 4714 4715 hrtimer_cancel(&n->timer); 4716 4717 clear_bit(NAPI_STATE_DISABLE, &n->state); 4718 } 4719 EXPORT_SYMBOL(napi_disable); 4720 4721 void netif_napi_del(struct napi_struct *napi) 4722 { 4723 list_del_init(&napi->dev_list); 4724 napi_free_frags(napi); 4725 4726 kfree_skb_list(napi->gro_list); 4727 napi->gro_list = NULL; 4728 napi->gro_count = 0; 4729 } 4730 EXPORT_SYMBOL(netif_napi_del); 4731 4732 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 4733 { 4734 void *have; 4735 int work, weight; 4736 4737 list_del_init(&n->poll_list); 4738 4739 have = netpoll_poll_lock(n); 4740 4741 weight = n->weight; 4742 4743 /* This NAPI_STATE_SCHED test is for avoiding a race 4744 * with netpoll's poll_napi(). Only the entity which 4745 * obtains the lock and sees NAPI_STATE_SCHED set will 4746 * actually make the ->poll() call. Therefore we avoid 4747 * accidentally calling ->poll() when NAPI is not scheduled. 4748 */ 4749 work = 0; 4750 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 4751 work = n->poll(n, weight); 4752 trace_napi_poll(n); 4753 } 4754 4755 WARN_ON_ONCE(work > weight); 4756 4757 if (likely(work < weight)) 4758 goto out_unlock; 4759 4760 /* Drivers must not modify the NAPI state if they 4761 * consume the entire weight. In such cases this code 4762 * still "owns" the NAPI instance and therefore can 4763 * move the instance around on the list at-will. 4764 */ 4765 if (unlikely(napi_disable_pending(n))) { 4766 napi_complete(n); 4767 goto out_unlock; 4768 } 4769 4770 if (n->gro_list) { 4771 /* flush too old packets 4772 * If HZ < 1000, flush all packets. 4773 */ 4774 napi_gro_flush(n, HZ >= 1000); 4775 } 4776 4777 /* Some drivers may have called napi_schedule 4778 * prior to exhausting their budget. 4779 */ 4780 if (unlikely(!list_empty(&n->poll_list))) { 4781 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 4782 n->dev ? 
n->dev->name : "backlog"); 4783 goto out_unlock; 4784 } 4785 4786 list_add_tail(&n->poll_list, repoll); 4787 4788 out_unlock: 4789 netpoll_poll_unlock(have); 4790 4791 return work; 4792 } 4793 4794 static void net_rx_action(struct softirq_action *h) 4795 { 4796 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4797 unsigned long time_limit = jiffies + 2; 4798 int budget = netdev_budget; 4799 LIST_HEAD(list); 4800 LIST_HEAD(repoll); 4801 4802 local_irq_disable(); 4803 list_splice_init(&sd->poll_list, &list); 4804 local_irq_enable(); 4805 4806 for (;;) { 4807 struct napi_struct *n; 4808 4809 if (list_empty(&list)) { 4810 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 4811 return; 4812 break; 4813 } 4814 4815 n = list_first_entry(&list, struct napi_struct, poll_list); 4816 budget -= napi_poll(n, &repoll); 4817 4818 /* If softirq window is exhausted then punt. 4819 * Allow this to run for 2 jiffies since which will allow 4820 * an average latency of 1.5/HZ. 4821 */ 4822 if (unlikely(budget <= 0 || 4823 time_after_eq(jiffies, time_limit))) { 4824 sd->time_squeeze++; 4825 break; 4826 } 4827 } 4828 4829 local_irq_disable(); 4830 4831 list_splice_tail_init(&sd->poll_list, &list); 4832 list_splice_tail(&repoll, &list); 4833 list_splice(&list, &sd->poll_list); 4834 if (!list_empty(&sd->poll_list)) 4835 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 4836 4837 net_rps_action_and_irq_enable(sd); 4838 } 4839 4840 struct netdev_adjacent { 4841 struct net_device *dev; 4842 4843 /* upper master flag, there can only be one master device per list */ 4844 bool master; 4845 4846 /* counter for the number of times this device was added to us */ 4847 u16 ref_nr; 4848 4849 /* private field for the users */ 4850 void *private; 4851 4852 struct list_head list; 4853 struct rcu_head rcu; 4854 }; 4855 4856 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, 4857 struct net_device *adj_dev, 4858 struct list_head *adj_list) 4859 { 4860 struct netdev_adjacent *adj; 4861 4862 list_for_each_entry(adj, adj_list, list) { 4863 if (adj->dev == adj_dev) 4864 return adj; 4865 } 4866 return NULL; 4867 } 4868 4869 /** 4870 * netdev_has_upper_dev - Check if device is linked to an upper device 4871 * @dev: device 4872 * @upper_dev: upper device to check 4873 * 4874 * Find out if a device is linked to specified upper device and return true 4875 * in case it is. Note that this checks only immediate upper device, 4876 * not through a complete stack of devices. The caller must hold the RTNL lock. 4877 */ 4878 bool netdev_has_upper_dev(struct net_device *dev, 4879 struct net_device *upper_dev) 4880 { 4881 ASSERT_RTNL(); 4882 4883 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); 4884 } 4885 EXPORT_SYMBOL(netdev_has_upper_dev); 4886 4887 /** 4888 * netdev_has_any_upper_dev - Check if device is linked to some device 4889 * @dev: device 4890 * 4891 * Find out if a device is linked to an upper device and return true in case 4892 * it is. The caller must hold the RTNL lock. 4893 */ 4894 static bool netdev_has_any_upper_dev(struct net_device *dev) 4895 { 4896 ASSERT_RTNL(); 4897 4898 return !list_empty(&dev->all_adj_list.upper); 4899 } 4900 4901 /** 4902 * netdev_master_upper_dev_get - Get master upper device 4903 * @dev: device 4904 * 4905 * Find a master upper device and return pointer to it or NULL in case 4906 * it's not there. The caller must hold the RTNL lock. 
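 *
 * For a bonding or bridge port, for instance, this is expected to return the
 * bond/bridge device itself: the master link is kept first on adj_list.upper
 * by __netdev_adjacent_dev_insert(), so a non-master first entry means there
 * is no master and NULL is returned.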
4907 */ 4908 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 4909 { 4910 struct netdev_adjacent *upper; 4911 4912 ASSERT_RTNL(); 4913 4914 if (list_empty(&dev->adj_list.upper)) 4915 return NULL; 4916 4917 upper = list_first_entry(&dev->adj_list.upper, 4918 struct netdev_adjacent, list); 4919 if (likely(upper->master)) 4920 return upper->dev; 4921 return NULL; 4922 } 4923 EXPORT_SYMBOL(netdev_master_upper_dev_get); 4924 4925 void *netdev_adjacent_get_private(struct list_head *adj_list) 4926 { 4927 struct netdev_adjacent *adj; 4928 4929 adj = list_entry(adj_list, struct netdev_adjacent, list); 4930 4931 return adj->private; 4932 } 4933 EXPORT_SYMBOL(netdev_adjacent_get_private); 4934 4935 /** 4936 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 4937 * @dev: device 4938 * @iter: list_head ** of the current position 4939 * 4940 * Gets the next device from the dev's upper list, starting from iter 4941 * position. The caller must hold RCU read lock. 4942 */ 4943 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 4944 struct list_head **iter) 4945 { 4946 struct netdev_adjacent *upper; 4947 4948 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4949 4950 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4951 4952 if (&upper->list == &dev->adj_list.upper) 4953 return NULL; 4954 4955 *iter = &upper->list; 4956 4957 return upper->dev; 4958 } 4959 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 4960 4961 /** 4962 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 4963 * @dev: device 4964 * @iter: list_head ** of the current position 4965 * 4966 * Gets the next device from the dev's upper list, starting from iter 4967 * position. The caller must hold RCU read lock. 4968 */ 4969 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 4970 struct list_head **iter) 4971 { 4972 struct netdev_adjacent *upper; 4973 4974 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4975 4976 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4977 4978 if (&upper->list == &dev->all_adj_list.upper) 4979 return NULL; 4980 4981 *iter = &upper->list; 4982 4983 return upper->dev; 4984 } 4985 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 4986 4987 /** 4988 * netdev_lower_get_next_private - Get the next ->private from the 4989 * lower neighbour list 4990 * @dev: device 4991 * @iter: list_head ** of the current position 4992 * 4993 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4994 * list, starting from iter position. The caller must hold either hold the 4995 * RTNL lock or its own locking that guarantees that the neighbour lower 4996 * list will remain unchainged. 4997 */ 4998 void *netdev_lower_get_next_private(struct net_device *dev, 4999 struct list_head **iter) 5000 { 5001 struct netdev_adjacent *lower; 5002 5003 lower = list_entry(*iter, struct netdev_adjacent, list); 5004 5005 if (&lower->list == &dev->adj_list.lower) 5006 return NULL; 5007 5008 *iter = lower->list.next; 5009 5010 return lower->private; 5011 } 5012 EXPORT_SYMBOL(netdev_lower_get_next_private); 5013 5014 /** 5015 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5016 * lower neighbour list, RCU 5017 * variant 5018 * @dev: device 5019 * @iter: list_head ** of the current position 5020 * 5021 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5022 * list, starting from iter position. The caller must hold RCU read lock. 
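 *
 * Illustrative use (a sketch; use_lower_private() is a hypothetical consumer):
 *
 *	struct list_head *iter = &dev->adj_list.lower;
 *	void *priv;
 *
 *	rcu_read_lock();
 *	while ((priv = netdev_lower_get_next_private_rcu(dev, &iter)))
 *		use_lower_private(priv);
 *	rcu_read_unlock();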
5023 */ 5024 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5025 struct list_head **iter) 5026 { 5027 struct netdev_adjacent *lower; 5028 5029 WARN_ON_ONCE(!rcu_read_lock_held()); 5030 5031 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5032 5033 if (&lower->list == &dev->adj_list.lower) 5034 return NULL; 5035 5036 *iter = &lower->list; 5037 5038 return lower->private; 5039 } 5040 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5041 5042 /** 5043 * netdev_lower_get_next - Get the next device from the lower neighbour 5044 * list 5045 * @dev: device 5046 * @iter: list_head ** of the current position 5047 * 5048 * Gets the next netdev_adjacent from the dev's lower neighbour 5049 * list, starting from iter position. The caller must hold RTNL lock or 5050 * its own locking that guarantees that the neighbour lower 5051 * list will remain unchainged. 5052 */ 5053 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5054 { 5055 struct netdev_adjacent *lower; 5056 5057 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 5058 5059 if (&lower->list == &dev->adj_list.lower) 5060 return NULL; 5061 5062 *iter = &lower->list; 5063 5064 return lower->dev; 5065 } 5066 EXPORT_SYMBOL(netdev_lower_get_next); 5067 5068 /** 5069 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5070 * lower neighbour list, RCU 5071 * variant 5072 * @dev: device 5073 * 5074 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5075 * list. The caller must hold RCU read lock. 5076 */ 5077 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5078 { 5079 struct netdev_adjacent *lower; 5080 5081 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5082 struct netdev_adjacent, list); 5083 if (lower) 5084 return lower->private; 5085 return NULL; 5086 } 5087 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5088 5089 /** 5090 * netdev_master_upper_dev_get_rcu - Get master upper device 5091 * @dev: device 5092 * 5093 * Find a master upper device and return pointer to it or NULL in case 5094 * it's not there. The caller must hold the RCU read lock. 5095 */ 5096 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5097 { 5098 struct netdev_adjacent *upper; 5099 5100 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5101 struct netdev_adjacent, list); 5102 if (upper && likely(upper->master)) 5103 return upper->dev; 5104 return NULL; 5105 } 5106 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5107 5108 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5109 struct net_device *adj_dev, 5110 struct list_head *dev_list) 5111 { 5112 char linkname[IFNAMSIZ+7]; 5113 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5114 "upper_%s" : "lower_%s", adj_dev->name); 5115 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5116 linkname); 5117 } 5118 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5119 char *name, 5120 struct list_head *dev_list) 5121 { 5122 char linkname[IFNAMSIZ+7]; 5123 sprintf(linkname, dev_list == &dev->adj_list.upper ? 
5124 "upper_%s" : "lower_%s", name); 5125 sysfs_remove_link(&(dev->dev.kobj), linkname); 5126 } 5127 5128 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5129 struct net_device *adj_dev, 5130 struct list_head *dev_list) 5131 { 5132 return (dev_list == &dev->adj_list.upper || 5133 dev_list == &dev->adj_list.lower) && 5134 net_eq(dev_net(dev), dev_net(adj_dev)); 5135 } 5136 5137 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5138 struct net_device *adj_dev, 5139 struct list_head *dev_list, 5140 void *private, bool master) 5141 { 5142 struct netdev_adjacent *adj; 5143 int ret; 5144 5145 adj = __netdev_find_adj(dev, adj_dev, dev_list); 5146 5147 if (adj) { 5148 adj->ref_nr++; 5149 return 0; 5150 } 5151 5152 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5153 if (!adj) 5154 return -ENOMEM; 5155 5156 adj->dev = adj_dev; 5157 adj->master = master; 5158 adj->ref_nr = 1; 5159 adj->private = private; 5160 dev_hold(adj_dev); 5161 5162 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5163 adj_dev->name, dev->name, adj_dev->name); 5164 5165 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5166 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5167 if (ret) 5168 goto free_adj; 5169 } 5170 5171 /* Ensure that master link is always the first item in list. */ 5172 if (master) { 5173 ret = sysfs_create_link(&(dev->dev.kobj), 5174 &(adj_dev->dev.kobj), "master"); 5175 if (ret) 5176 goto remove_symlinks; 5177 5178 list_add_rcu(&adj->list, dev_list); 5179 } else { 5180 list_add_tail_rcu(&adj->list, dev_list); 5181 } 5182 5183 return 0; 5184 5185 remove_symlinks: 5186 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5187 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5188 free_adj: 5189 kfree(adj); 5190 dev_put(adj_dev); 5191 5192 return ret; 5193 } 5194 5195 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5196 struct net_device *adj_dev, 5197 struct list_head *dev_list) 5198 { 5199 struct netdev_adjacent *adj; 5200 5201 adj = __netdev_find_adj(dev, adj_dev, dev_list); 5202 5203 if (!adj) { 5204 pr_err("tried to remove device %s from %s\n", 5205 dev->name, adj_dev->name); 5206 BUG(); 5207 } 5208 5209 if (adj->ref_nr > 1) { 5210 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5211 adj->ref_nr-1); 5212 adj->ref_nr--; 5213 return; 5214 } 5215 5216 if (adj->master) 5217 sysfs_remove_link(&(dev->dev.kobj), "master"); 5218 5219 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5220 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5221 5222 list_del_rcu(&adj->list); 5223 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5224 adj_dev->name, dev->name, adj_dev->name); 5225 dev_put(adj_dev); 5226 kfree_rcu(adj, rcu); 5227 } 5228 5229 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5230 struct net_device *upper_dev, 5231 struct list_head *up_list, 5232 struct list_head *down_list, 5233 void *private, bool master) 5234 { 5235 int ret; 5236 5237 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5238 master); 5239 if (ret) 5240 return ret; 5241 5242 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5243 false); 5244 if (ret) { 5245 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5246 return ret; 5247 } 5248 5249 return 0; 5250 } 5251 5252 static int __netdev_adjacent_dev_link(struct net_device *dev, 5253 struct net_device *upper_dev) 5254 { 5255 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5256 
&dev->all_adj_list.upper, 5257 &upper_dev->all_adj_list.lower, 5258 NULL, false); 5259 } 5260 5261 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5262 struct net_device *upper_dev, 5263 struct list_head *up_list, 5264 struct list_head *down_list) 5265 { 5266 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5267 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5268 } 5269 5270 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5271 struct net_device *upper_dev) 5272 { 5273 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5274 &dev->all_adj_list.upper, 5275 &upper_dev->all_adj_list.lower); 5276 } 5277 5278 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5279 struct net_device *upper_dev, 5280 void *private, bool master) 5281 { 5282 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5283 5284 if (ret) 5285 return ret; 5286 5287 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5288 &dev->adj_list.upper, 5289 &upper_dev->adj_list.lower, 5290 private, master); 5291 if (ret) { 5292 __netdev_adjacent_dev_unlink(dev, upper_dev); 5293 return ret; 5294 } 5295 5296 return 0; 5297 } 5298 5299 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5300 struct net_device *upper_dev) 5301 { 5302 __netdev_adjacent_dev_unlink(dev, upper_dev); 5303 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5304 &dev->adj_list.upper, 5305 &upper_dev->adj_list.lower); 5306 } 5307 5308 static int __netdev_upper_dev_link(struct net_device *dev, 5309 struct net_device *upper_dev, bool master, 5310 void *private) 5311 { 5312 struct netdev_adjacent *i, *j, *to_i, *to_j; 5313 int ret = 0; 5314 5315 ASSERT_RTNL(); 5316 5317 if (dev == upper_dev) 5318 return -EBUSY; 5319 5320 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5321 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) 5322 return -EBUSY; 5323 5324 if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) 5325 return -EEXIST; 5326 5327 if (master && netdev_master_upper_dev_get(dev)) 5328 return -EBUSY; 5329 5330 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, 5331 master); 5332 if (ret) 5333 return ret; 5334 5335 /* Now that we linked these devs, make all the upper_dev's 5336 * all_adj_list.upper visible to every dev's all_adj_list.lower an 5337 * versa, and don't forget the devices itself. All of these 5338 * links are non-neighbours. 
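 *
 * As an illustration: if eth0 is already a lower of bond0 and bond0 is now
 * linked under br0, the loops below create the extra non-neighbour link
 * eth0 -> br0 (and, had br0 any uppers of its own, bond0 and eth0 would be
 * linked to those as well), so all_adj_list always holds the transitive
 * closure of the stacking graph.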
5339 */ 5340 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5341 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5342 pr_debug("Interlinking %s with %s, non-neighbour\n", 5343 i->dev->name, j->dev->name); 5344 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5345 if (ret) 5346 goto rollback_mesh; 5347 } 5348 } 5349 5350 /* add dev to every upper_dev's upper device */ 5351 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5352 pr_debug("linking %s's upper device %s with %s\n", 5353 upper_dev->name, i->dev->name, dev->name); 5354 ret = __netdev_adjacent_dev_link(dev, i->dev); 5355 if (ret) 5356 goto rollback_upper_mesh; 5357 } 5358 5359 /* add upper_dev to every dev's lower device */ 5360 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5361 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5362 i->dev->name, upper_dev->name); 5363 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5364 if (ret) 5365 goto rollback_lower_mesh; 5366 } 5367 5368 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5369 return 0; 5370 5371 rollback_lower_mesh: 5372 to_i = i; 5373 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5374 if (i == to_i) 5375 break; 5376 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5377 } 5378 5379 i = NULL; 5380 5381 rollback_upper_mesh: 5382 to_i = i; 5383 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5384 if (i == to_i) 5385 break; 5386 __netdev_adjacent_dev_unlink(dev, i->dev); 5387 } 5388 5389 i = j = NULL; 5390 5391 rollback_mesh: 5392 to_i = i; 5393 to_j = j; 5394 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5395 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5396 if (i == to_i && j == to_j) 5397 break; 5398 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5399 } 5400 if (i == to_i) 5401 break; 5402 } 5403 5404 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5405 5406 return ret; 5407 } 5408 5409 /** 5410 * netdev_upper_dev_link - Add a link to the upper device 5411 * @dev: device 5412 * @upper_dev: new upper device 5413 * 5414 * Adds a link to device which is upper to this one. The caller must hold 5415 * the RTNL lock. On a failure a negative errno code is returned. 5416 * On success the reference counts are adjusted and the function 5417 * returns zero. 5418 */ 5419 int netdev_upper_dev_link(struct net_device *dev, 5420 struct net_device *upper_dev) 5421 { 5422 return __netdev_upper_dev_link(dev, upper_dev, false, NULL); 5423 } 5424 EXPORT_SYMBOL(netdev_upper_dev_link); 5425 5426 /** 5427 * netdev_master_upper_dev_link - Add a master link to the upper device 5428 * @dev: device 5429 * @upper_dev: new upper device 5430 * 5431 * Adds a link to device which is upper to this one. In this case, only 5432 * one master upper device can be linked, although other non-master devices 5433 * might be linked as well. The caller must hold the RTNL lock. 5434 * On a failure a negative errno code is returned. On success the reference 5435 * counts are adjusted and the function returns zero. 
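 *
 * A typical caller would be an enslave path (bonding, team, bridge and the
 * like) running under rtnl_lock(). As __netdev_upper_dev_link() shows, the
 * call fails with -EBUSY if a master is already present or a loop would be
 * created, and with -EEXIST if the link already exists.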
5436 */ 5437 int netdev_master_upper_dev_link(struct net_device *dev, 5438 struct net_device *upper_dev) 5439 { 5440 return __netdev_upper_dev_link(dev, upper_dev, true, NULL); 5441 } 5442 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5443 5444 int netdev_master_upper_dev_link_private(struct net_device *dev, 5445 struct net_device *upper_dev, 5446 void *private) 5447 { 5448 return __netdev_upper_dev_link(dev, upper_dev, true, private); 5449 } 5450 EXPORT_SYMBOL(netdev_master_upper_dev_link_private); 5451 5452 /** 5453 * netdev_upper_dev_unlink - Removes a link to upper device 5454 * @dev: device 5455 * @upper_dev: upper device to remove 5456 * 5457 * Removes a link to a device which is upper to this one. The caller must hold 5458 * the RTNL lock. 5459 */ 5460 void netdev_upper_dev_unlink(struct net_device *dev, 5461 struct net_device *upper_dev) 5462 { 5463 struct netdev_adjacent *i, *j; 5464 ASSERT_RTNL(); 5465 5466 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5467 5468 /* Here is the tricky part. We must remove all dev's lower 5469 * devices from all upper_dev's upper devices and vice 5470 * versa, to maintain the graph relationship. 5471 */ 5472 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5473 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) 5474 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5475 5476 /* also remove the devices themselves from the lower/upper 5477 * device lists 5478 */ 5479 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5480 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5481 5482 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) 5483 __netdev_adjacent_dev_unlink(dev, i->dev); 5484 5485 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5486 } 5487 EXPORT_SYMBOL(netdev_upper_dev_unlink); 5488 5489 /** 5490 * netdev_bonding_info_change - Dispatch event about slave change 5491 * @dev: device 5492 * @bonding_info: info to dispatch 5493 * 5494 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 5495 * The caller must hold the RTNL lock.
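 *
 * This is presumably called by the bonding driver whenever a slave's state
 * changes, so that listeners (e.g. drivers offloading the LAG to hardware)
 * can pick the new state up from their notifier blocks.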
5496 */ 5497 void netdev_bonding_info_change(struct net_device *dev, 5498 struct netdev_bonding_info *bonding_info) 5499 { 5500 struct netdev_notifier_bonding_info info; 5501 5502 memcpy(&info.bonding_info, bonding_info, 5503 sizeof(struct netdev_bonding_info)); 5504 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 5505 &info.info); 5506 } 5507 EXPORT_SYMBOL(netdev_bonding_info_change); 5508 5509 static void netdev_adjacent_add_links(struct net_device *dev) 5510 { 5511 struct netdev_adjacent *iter; 5512 5513 struct net *net = dev_net(dev); 5514 5515 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5516 if (!net_eq(net,dev_net(iter->dev))) 5517 continue; 5518 netdev_adjacent_sysfs_add(iter->dev, dev, 5519 &iter->dev->adj_list.lower); 5520 netdev_adjacent_sysfs_add(dev, iter->dev, 5521 &dev->adj_list.upper); 5522 } 5523 5524 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5525 if (!net_eq(net,dev_net(iter->dev))) 5526 continue; 5527 netdev_adjacent_sysfs_add(iter->dev, dev, 5528 &iter->dev->adj_list.upper); 5529 netdev_adjacent_sysfs_add(dev, iter->dev, 5530 &dev->adj_list.lower); 5531 } 5532 } 5533 5534 static void netdev_adjacent_del_links(struct net_device *dev) 5535 { 5536 struct netdev_adjacent *iter; 5537 5538 struct net *net = dev_net(dev); 5539 5540 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5541 if (!net_eq(net,dev_net(iter->dev))) 5542 continue; 5543 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5544 &iter->dev->adj_list.lower); 5545 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5546 &dev->adj_list.upper); 5547 } 5548 5549 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5550 if (!net_eq(net,dev_net(iter->dev))) 5551 continue; 5552 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5553 &iter->dev->adj_list.upper); 5554 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5555 &dev->adj_list.lower); 5556 } 5557 } 5558 5559 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 5560 { 5561 struct netdev_adjacent *iter; 5562 5563 struct net *net = dev_net(dev); 5564 5565 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5566 if (!net_eq(net,dev_net(iter->dev))) 5567 continue; 5568 netdev_adjacent_sysfs_del(iter->dev, oldname, 5569 &iter->dev->adj_list.lower); 5570 netdev_adjacent_sysfs_add(iter->dev, dev, 5571 &iter->dev->adj_list.lower); 5572 } 5573 5574 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5575 if (!net_eq(net,dev_net(iter->dev))) 5576 continue; 5577 netdev_adjacent_sysfs_del(iter->dev, oldname, 5578 &iter->dev->adj_list.upper); 5579 netdev_adjacent_sysfs_add(iter->dev, dev, 5580 &iter->dev->adj_list.upper); 5581 } 5582 } 5583 5584 void *netdev_lower_dev_get_private(struct net_device *dev, 5585 struct net_device *lower_dev) 5586 { 5587 struct netdev_adjacent *lower; 5588 5589 if (!lower_dev) 5590 return NULL; 5591 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); 5592 if (!lower) 5593 return NULL; 5594 5595 return lower->private; 5596 } 5597 EXPORT_SYMBOL(netdev_lower_dev_get_private); 5598 5599 5600 int dev_get_nest_level(struct net_device *dev, 5601 bool (*type_check)(struct net_device *dev)) 5602 { 5603 struct net_device *lower = NULL; 5604 struct list_head *iter; 5605 int max_nest = -1; 5606 int nest; 5607 5608 ASSERT_RTNL(); 5609 5610 netdev_for_each_lower_dev(dev, lower, iter) { 5611 nest = dev_get_nest_level(lower, type_check); 5612 if (max_nest < nest) 5613 max_nest = nest; 5614 } 5615 5616 if (type_check(dev)) 5617 max_nest++; 5618 5619 return max_nest; 5620 } 5621 
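/* A worked example for dev_get_nest_level() above, with illustrative names:
 * assume macvlan1 is stacked on macvlan0, which sits on eth0, and type_check()
 * matches only macvlan devices. eth0 has no lowers and fails type_check(), so
 * it yields -1; macvlan0 sees max_nest = -1 from eth0 and increments for
 * itself, yielding 0; macvlan1 sees 0 from macvlan0 and yields 1. The result
 * is typically used as a lockdep subclass so each stacking layer gets its own
 * nesting level.
 */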
EXPORT_SYMBOL(dev_get_nest_level); 5622 5623 static void dev_change_rx_flags(struct net_device *dev, int flags) 5624 { 5625 const struct net_device_ops *ops = dev->netdev_ops; 5626 5627 if (ops->ndo_change_rx_flags) 5628 ops->ndo_change_rx_flags(dev, flags); 5629 } 5630 5631 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 5632 { 5633 unsigned int old_flags = dev->flags; 5634 kuid_t uid; 5635 kgid_t gid; 5636 5637 ASSERT_RTNL(); 5638 5639 dev->flags |= IFF_PROMISC; 5640 dev->promiscuity += inc; 5641 if (dev->promiscuity == 0) { 5642 /* 5643 * Avoid overflow. 5644 * If inc causes overflow, untouch promisc and return error. 5645 */ 5646 if (inc < 0) 5647 dev->flags &= ~IFF_PROMISC; 5648 else { 5649 dev->promiscuity -= inc; 5650 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 5651 dev->name); 5652 return -EOVERFLOW; 5653 } 5654 } 5655 if (dev->flags != old_flags) { 5656 pr_info("device %s %s promiscuous mode\n", 5657 dev->name, 5658 dev->flags & IFF_PROMISC ? "entered" : "left"); 5659 if (audit_enabled) { 5660 current_uid_gid(&uid, &gid); 5661 audit_log(current->audit_context, GFP_ATOMIC, 5662 AUDIT_ANOM_PROMISCUOUS, 5663 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 5664 dev->name, (dev->flags & IFF_PROMISC), 5665 (old_flags & IFF_PROMISC), 5666 from_kuid(&init_user_ns, audit_get_loginuid(current)), 5667 from_kuid(&init_user_ns, uid), 5668 from_kgid(&init_user_ns, gid), 5669 audit_get_sessionid(current)); 5670 } 5671 5672 dev_change_rx_flags(dev, IFF_PROMISC); 5673 } 5674 if (notify) 5675 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 5676 return 0; 5677 } 5678 5679 /** 5680 * dev_set_promiscuity - update promiscuity count on a device 5681 * @dev: device 5682 * @inc: modifier 5683 * 5684 * Add or remove promiscuity from a device. While the count in the device 5685 * remains above zero the interface remains promiscuous. Once it hits zero 5686 * the device reverts back to normal filtering operation. A negative inc 5687 * value is used to drop promiscuity on the device. 5688 * Return 0 if successful or a negative errno code on error. 5689 */ 5690 int dev_set_promiscuity(struct net_device *dev, int inc) 5691 { 5692 unsigned int old_flags = dev->flags; 5693 int err; 5694 5695 err = __dev_set_promiscuity(dev, inc, true); 5696 if (err < 0) 5697 return err; 5698 if (dev->flags != old_flags) 5699 dev_set_rx_mode(dev); 5700 return err; 5701 } 5702 EXPORT_SYMBOL(dev_set_promiscuity); 5703 5704 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 5705 { 5706 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 5707 5708 ASSERT_RTNL(); 5709 5710 dev->flags |= IFF_ALLMULTI; 5711 dev->allmulti += inc; 5712 if (dev->allmulti == 0) { 5713 /* 5714 * Avoid overflow. 5715 * If inc causes overflow, untouch allmulti and return error. 5716 */ 5717 if (inc < 0) 5718 dev->flags &= ~IFF_ALLMULTI; 5719 else { 5720 dev->allmulti -= inc; 5721 pr_warn("%s: allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n", 5722 dev->name); 5723 return -EOVERFLOW; 5724 } 5725 } 5726 if (dev->flags ^ old_flags) { 5727 dev_change_rx_flags(dev, IFF_ALLMULTI); 5728 dev_set_rx_mode(dev); 5729 if (notify) 5730 __dev_notify_flags(dev, old_flags, 5731 dev->gflags ^ old_gflags); 5732 } 5733 return 0; 5734 } 5735 5736 /** 5737 * dev_set_allmulti - update allmulti count on a device 5738 * @dev: device 5739 * @inc: modifier 5740 * 5741 * Add or remove reception of all multicast frames to a device. While the 5742 * count in the device remains above zero the interface remains listening 5743 * to all interfaces. Once it hits zero the device reverts back to normal 5744 * filtering operation. A negative @inc value is used to drop the counter 5745 * when releasing a resource needing all multicasts. 5746 * Return 0 if successful or a negative errno code on error. 5747 */ 5748 5749 int dev_set_allmulti(struct net_device *dev, int inc) 5750 { 5751 return __dev_set_allmulti(dev, inc, true); 5752 } 5753 EXPORT_SYMBOL(dev_set_allmulti); 5754 5755 /* 5756 * Upload unicast and multicast address lists to device and 5757 * configure RX filtering. When the device doesn't support unicast 5758 * filtering it is put in promiscuous mode while unicast addresses 5759 * are present. 5760 */ 5761 void __dev_set_rx_mode(struct net_device *dev) 5762 { 5763 const struct net_device_ops *ops = dev->netdev_ops; 5764 5765 /* dev_open will call this function so the list will stay sane. */ 5766 if (!(dev->flags&IFF_UP)) 5767 return; 5768 5769 if (!netif_device_present(dev)) 5770 return; 5771 5772 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 5773 /* Unicast addresses changes may only happen under the rtnl, 5774 * therefore calling __dev_set_promiscuity here is safe. 5775 */ 5776 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 5777 __dev_set_promiscuity(dev, 1, false); 5778 dev->uc_promisc = true; 5779 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 5780 __dev_set_promiscuity(dev, -1, false); 5781 dev->uc_promisc = false; 5782 } 5783 } 5784 5785 if (ops->ndo_set_rx_mode) 5786 ops->ndo_set_rx_mode(dev); 5787 } 5788 5789 void dev_set_rx_mode(struct net_device *dev) 5790 { 5791 netif_addr_lock_bh(dev); 5792 __dev_set_rx_mode(dev); 5793 netif_addr_unlock_bh(dev); 5794 } 5795 5796 /** 5797 * dev_get_flags - get flags reported to userspace 5798 * @dev: device 5799 * 5800 * Get the combination of flag bits exported through APIs to userspace. 5801 */ 5802 unsigned int dev_get_flags(const struct net_device *dev) 5803 { 5804 unsigned int flags; 5805 5806 flags = (dev->flags & ~(IFF_PROMISC | 5807 IFF_ALLMULTI | 5808 IFF_RUNNING | 5809 IFF_LOWER_UP | 5810 IFF_DORMANT)) | 5811 (dev->gflags & (IFF_PROMISC | 5812 IFF_ALLMULTI)); 5813 5814 if (netif_running(dev)) { 5815 if (netif_oper_up(dev)) 5816 flags |= IFF_RUNNING; 5817 if (netif_carrier_ok(dev)) 5818 flags |= IFF_LOWER_UP; 5819 if (netif_dormant(dev)) 5820 flags |= IFF_DORMANT; 5821 } 5822 5823 return flags; 5824 } 5825 EXPORT_SYMBOL(dev_get_flags); 5826 5827 int __dev_change_flags(struct net_device *dev, unsigned int flags) 5828 { 5829 unsigned int old_flags = dev->flags; 5830 int ret; 5831 5832 ASSERT_RTNL(); 5833 5834 /* 5835 * Set the flags on our device. 
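 * Only the user-settable bits (IFF_DEBUG, IFF_NOTRAILERS, IFF_NOARP,
 * IFF_DYNAMIC, IFF_MULTICAST, IFF_PORTSEL, IFF_AUTOMEDIA) are taken from
 * @flags here; IFF_UP, IFF_PROMISC, IFF_ALLMULTI and IFF_VOLATILE are
 * preserved from the current state and handled explicitly further down.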
5836 */ 5837 5838 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 5839 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 5840 IFF_AUTOMEDIA)) | 5841 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 5842 IFF_ALLMULTI)); 5843 5844 /* 5845 * Load in the correct multicast list now the flags have changed. 5846 */ 5847 5848 if ((old_flags ^ flags) & IFF_MULTICAST) 5849 dev_change_rx_flags(dev, IFF_MULTICAST); 5850 5851 dev_set_rx_mode(dev); 5852 5853 /* 5854 * Have we downed the interface. We handle IFF_UP ourselves 5855 * according to user attempts to set it, rather than blindly 5856 * setting it. 5857 */ 5858 5859 ret = 0; 5860 if ((old_flags ^ flags) & IFF_UP) 5861 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 5862 5863 if ((flags ^ dev->gflags) & IFF_PROMISC) { 5864 int inc = (flags & IFF_PROMISC) ? 1 : -1; 5865 unsigned int old_flags = dev->flags; 5866 5867 dev->gflags ^= IFF_PROMISC; 5868 5869 if (__dev_set_promiscuity(dev, inc, false) >= 0) 5870 if (dev->flags != old_flags) 5871 dev_set_rx_mode(dev); 5872 } 5873 5874 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 5875 is important. Some (broken) drivers set IFF_PROMISC, when 5876 IFF_ALLMULTI is requested not asking us and not reporting. 5877 */ 5878 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 5879 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 5880 5881 dev->gflags ^= IFF_ALLMULTI; 5882 __dev_set_allmulti(dev, inc, false); 5883 } 5884 5885 return ret; 5886 } 5887 5888 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 5889 unsigned int gchanges) 5890 { 5891 unsigned int changes = dev->flags ^ old_flags; 5892 5893 if (gchanges) 5894 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 5895 5896 if (changes & IFF_UP) { 5897 if (dev->flags & IFF_UP) 5898 call_netdevice_notifiers(NETDEV_UP, dev); 5899 else 5900 call_netdevice_notifiers(NETDEV_DOWN, dev); 5901 } 5902 5903 if (dev->flags & IFF_UP && 5904 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 5905 struct netdev_notifier_change_info change_info; 5906 5907 change_info.flags_changed = changes; 5908 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 5909 &change_info.info); 5910 } 5911 } 5912 5913 /** 5914 * dev_change_flags - change device settings 5915 * @dev: device 5916 * @flags: device state flags 5917 * 5918 * Change settings on device based state flags. The flags are 5919 * in the userspace exported format. 5920 */ 5921 int dev_change_flags(struct net_device *dev, unsigned int flags) 5922 { 5923 int ret; 5924 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 5925 5926 ret = __dev_change_flags(dev, flags); 5927 if (ret < 0) 5928 return ret; 5929 5930 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 5931 __dev_notify_flags(dev, old_flags, changes); 5932 return ret; 5933 } 5934 EXPORT_SYMBOL(dev_change_flags); 5935 5936 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 5937 { 5938 const struct net_device_ops *ops = dev->netdev_ops; 5939 5940 if (ops->ndo_change_mtu) 5941 return ops->ndo_change_mtu(dev, new_mtu); 5942 5943 dev->mtu = new_mtu; 5944 return 0; 5945 } 5946 5947 /** 5948 * dev_set_mtu - Change maximum transfer unit 5949 * @dev: device 5950 * @new_mtu: new transfer unit 5951 * 5952 * Change the maximum transfer size of the network device. 5953 */ 5954 int dev_set_mtu(struct net_device *dev, int new_mtu) 5955 { 5956 int err, orig_mtu; 5957 5958 if (new_mtu == dev->mtu) 5959 return 0; 5960 5961 /* MTU must be positive. 
*/ 5962 if (new_mtu < 0) 5963 return -EINVAL; 5964 5965 if (!netif_device_present(dev)) 5966 return -ENODEV; 5967 5968 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 5969 err = notifier_to_errno(err); 5970 if (err) 5971 return err; 5972 5973 orig_mtu = dev->mtu; 5974 err = __dev_set_mtu(dev, new_mtu); 5975 5976 if (!err) { 5977 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5978 err = notifier_to_errno(err); 5979 if (err) { 5980 /* setting mtu back and notifying everyone again, 5981 * so that they have a chance to revert changes. 5982 */ 5983 __dev_set_mtu(dev, orig_mtu); 5984 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5985 } 5986 } 5987 return err; 5988 } 5989 EXPORT_SYMBOL(dev_set_mtu); 5990 5991 /** 5992 * dev_set_group - Change group this device belongs to 5993 * @dev: device 5994 * @new_group: group this device should belong to 5995 */ 5996 void dev_set_group(struct net_device *dev, int new_group) 5997 { 5998 dev->group = new_group; 5999 } 6000 EXPORT_SYMBOL(dev_set_group); 6001 6002 /** 6003 * dev_set_mac_address - Change Media Access Control Address 6004 * @dev: device 6005 * @sa: new address 6006 * 6007 * Change the hardware (MAC) address of the device 6008 */ 6009 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 6010 { 6011 const struct net_device_ops *ops = dev->netdev_ops; 6012 int err; 6013 6014 if (!ops->ndo_set_mac_address) 6015 return -EOPNOTSUPP; 6016 if (sa->sa_family != dev->type) 6017 return -EINVAL; 6018 if (!netif_device_present(dev)) 6019 return -ENODEV; 6020 err = ops->ndo_set_mac_address(dev, sa); 6021 if (err) 6022 return err; 6023 dev->addr_assign_type = NET_ADDR_SET; 6024 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 6025 add_device_randomness(dev->dev_addr, dev->addr_len); 6026 return 0; 6027 } 6028 EXPORT_SYMBOL(dev_set_mac_address); 6029 6030 /** 6031 * dev_change_carrier - Change device carrier 6032 * @dev: device 6033 * @new_carrier: new value 6034 * 6035 * Change device carrier 6036 */ 6037 int dev_change_carrier(struct net_device *dev, bool new_carrier) 6038 { 6039 const struct net_device_ops *ops = dev->netdev_ops; 6040 6041 if (!ops->ndo_change_carrier) 6042 return -EOPNOTSUPP; 6043 if (!netif_device_present(dev)) 6044 return -ENODEV; 6045 return ops->ndo_change_carrier(dev, new_carrier); 6046 } 6047 EXPORT_SYMBOL(dev_change_carrier); 6048 6049 /** 6050 * dev_get_phys_port_id - Get device physical port ID 6051 * @dev: device 6052 * @ppid: port ID 6053 * 6054 * Get device physical port ID 6055 */ 6056 int dev_get_phys_port_id(struct net_device *dev, 6057 struct netdev_phys_item_id *ppid) 6058 { 6059 const struct net_device_ops *ops = dev->netdev_ops; 6060 6061 if (!ops->ndo_get_phys_port_id) 6062 return -EOPNOTSUPP; 6063 return ops->ndo_get_phys_port_id(dev, ppid); 6064 } 6065 EXPORT_SYMBOL(dev_get_phys_port_id); 6066 6067 /** 6068 * dev_get_phys_port_name - Get device physical port name 6069 * @dev: device 6070 * @name: port name 6071 * 6072 * Get device physical port name 6073 */ 6074 int dev_get_phys_port_name(struct net_device *dev, 6075 char *name, size_t len) 6076 { 6077 const struct net_device_ops *ops = dev->netdev_ops; 6078 6079 if (!ops->ndo_get_phys_port_name) 6080 return -EOPNOTSUPP; 6081 return ops->ndo_get_phys_port_name(dev, name, len); 6082 } 6083 EXPORT_SYMBOL(dev_get_phys_port_name); 6084 6085 /** 6086 * dev_new_index - allocate an ifindex 6087 * @net: the applicable net namespace 6088 * 6089 * Returns a suitable unique value for a new device interface 6090 * number. 
The caller must hold the rtnl semaphore or the 6091 * dev_base_lock to be sure it remains unique. 6092 */ 6093 static int dev_new_index(struct net *net) 6094 { 6095 int ifindex = net->ifindex; 6096 for (;;) { 6097 if (++ifindex <= 0) 6098 ifindex = 1; 6099 if (!__dev_get_by_index(net, ifindex)) 6100 return net->ifindex = ifindex; 6101 } 6102 } 6103 6104 /* Delayed registration/unregisteration */ 6105 static LIST_HEAD(net_todo_list); 6106 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 6107 6108 static void net_set_todo(struct net_device *dev) 6109 { 6110 list_add_tail(&dev->todo_list, &net_todo_list); 6111 dev_net(dev)->dev_unreg_count++; 6112 } 6113 6114 static void rollback_registered_many(struct list_head *head) 6115 { 6116 struct net_device *dev, *tmp; 6117 LIST_HEAD(close_head); 6118 6119 BUG_ON(dev_boot_phase); 6120 ASSERT_RTNL(); 6121 6122 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 6123 /* Some devices call without registering 6124 * for initialization unwind. Remove those 6125 * devices and proceed with the remaining. 6126 */ 6127 if (dev->reg_state == NETREG_UNINITIALIZED) { 6128 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 6129 dev->name, dev); 6130 6131 WARN_ON(1); 6132 list_del(&dev->unreg_list); 6133 continue; 6134 } 6135 dev->dismantle = true; 6136 BUG_ON(dev->reg_state != NETREG_REGISTERED); 6137 } 6138 6139 /* If device is running, close it first. */ 6140 list_for_each_entry(dev, head, unreg_list) 6141 list_add_tail(&dev->close_list, &close_head); 6142 dev_close_many(&close_head, true); 6143 6144 list_for_each_entry(dev, head, unreg_list) { 6145 /* And unlink it from device chain. */ 6146 unlist_netdevice(dev); 6147 6148 dev->reg_state = NETREG_UNREGISTERING; 6149 } 6150 6151 synchronize_net(); 6152 6153 list_for_each_entry(dev, head, unreg_list) { 6154 struct sk_buff *skb = NULL; 6155 6156 /* Shutdown queueing discipline. */ 6157 dev_shutdown(dev); 6158 6159 6160 /* Notify protocols, that we are about to destroy 6161 this device. They should clean all the things. 6162 */ 6163 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6164 6165 if (!dev->rtnl_link_ops || 6166 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6167 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6168 GFP_KERNEL); 6169 6170 /* 6171 * Flush the unicast and multicast chains 6172 */ 6173 dev_uc_flush(dev); 6174 dev_mc_flush(dev); 6175 6176 if (dev->netdev_ops->ndo_uninit) 6177 dev->netdev_ops->ndo_uninit(dev); 6178 6179 if (skb) 6180 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6181 6182 /* Notifier chain MUST detach us all upper devices. 
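 *
 * Editor's note (illustrative sketch, not part of the original source): a
 * stacking driver that linked itself above a lower device is expected to
 * undo that link from its NETDEV_UNREGISTER handler, otherwise the
 * WARN_ON() below fires.  The foo_* helpers are hypothetical.
 *
 *      static int foo_netdev_event(struct notifier_block *nb,
 *                                  unsigned long event, void *ptr)
 *      {
 *              struct net_device *lower = netdev_notifier_info_to_dev(ptr);
 *
 *              if (event == NETDEV_UNREGISTER && foo_owns_lower(lower))
 *                      netdev_upper_dev_unlink(lower, foo_get_upper(lower));
 *              return NOTIFY_DONE;
 *      }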
*/ 6183 WARN_ON(netdev_has_any_upper_dev(dev)); 6184 6185 /* Remove entries from kobject tree */ 6186 netdev_unregister_kobject(dev); 6187 #ifdef CONFIG_XPS 6188 /* Remove XPS queueing entries */ 6189 netif_reset_xps_queues_gt(dev, 0); 6190 #endif 6191 } 6192 6193 synchronize_net(); 6194 6195 list_for_each_entry(dev, head, unreg_list) 6196 dev_put(dev); 6197 } 6198 6199 static void rollback_registered(struct net_device *dev) 6200 { 6201 LIST_HEAD(single); 6202 6203 list_add(&dev->unreg_list, &single); 6204 rollback_registered_many(&single); 6205 list_del(&single); 6206 } 6207 6208 static netdev_features_t netdev_fix_features(struct net_device *dev, 6209 netdev_features_t features) 6210 { 6211 /* Fix illegal checksum combinations */ 6212 if ((features & NETIF_F_HW_CSUM) && 6213 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6214 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6215 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6216 } 6217 6218 /* TSO requires that SG is present as well. */ 6219 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6220 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6221 features &= ~NETIF_F_ALL_TSO; 6222 } 6223 6224 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6225 !(features & NETIF_F_IP_CSUM)) { 6226 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6227 features &= ~NETIF_F_TSO; 6228 features &= ~NETIF_F_TSO_ECN; 6229 } 6230 6231 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6232 !(features & NETIF_F_IPV6_CSUM)) { 6233 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6234 features &= ~NETIF_F_TSO6; 6235 } 6236 6237 /* TSO ECN requires that TSO is present as well. */ 6238 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6239 features &= ~NETIF_F_TSO_ECN; 6240 6241 /* Software GSO depends on SG. */ 6242 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6243 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6244 features &= ~NETIF_F_GSO; 6245 } 6246 6247 /* UFO needs SG and checksumming */ 6248 if (features & NETIF_F_UFO) { 6249 /* maybe split UFO into V4 and V6? 
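 *
 * Editor's note (illustrative, not part of the original source): with the
 * checks below, NETIF_F_UFO survives only when scatter/gather is enabled
 * and the checksum capabilities cover both IPv4 and IPv6, for example:
 *
 *      UFO + SG + HW_CSUM                   -> UFO is kept
 *      UFO + SG + IP_CSUM + IPV6_CSUM       -> UFO is kept
 *      UFO + SG + IP_CSUM only              -> UFO is dropped
 *      UFO + HW_CSUM, no SG                 -> UFO is dropped
 *
 * UFO builds UDP fragments for both address families, so it needs either
 * NETIF_F_GEN_CSUM or the IP_CSUM and IPV6_CSUM pair.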
*/ 6250 if (!((features & NETIF_F_GEN_CSUM) || 6251 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) 6252 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6253 netdev_dbg(dev, 6254 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6255 features &= ~NETIF_F_UFO; 6256 } 6257 6258 if (!(features & NETIF_F_SG)) { 6259 netdev_dbg(dev, 6260 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6261 features &= ~NETIF_F_UFO; 6262 } 6263 } 6264 6265 #ifdef CONFIG_NET_RX_BUSY_POLL 6266 if (dev->netdev_ops->ndo_busy_poll) 6267 features |= NETIF_F_BUSY_POLL; 6268 else 6269 #endif 6270 features &= ~NETIF_F_BUSY_POLL; 6271 6272 return features; 6273 } 6274 6275 int __netdev_update_features(struct net_device *dev) 6276 { 6277 netdev_features_t features; 6278 int err = 0; 6279 6280 ASSERT_RTNL(); 6281 6282 features = netdev_get_wanted_features(dev); 6283 6284 if (dev->netdev_ops->ndo_fix_features) 6285 features = dev->netdev_ops->ndo_fix_features(dev, features); 6286 6287 /* driver might be less strict about feature dependencies */ 6288 features = netdev_fix_features(dev, features); 6289 6290 if (dev->features == features) 6291 return 0; 6292 6293 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6294 &dev->features, &features); 6295 6296 if (dev->netdev_ops->ndo_set_features) 6297 err = dev->netdev_ops->ndo_set_features(dev, features); 6298 6299 if (unlikely(err < 0)) { 6300 netdev_err(dev, 6301 "set_features() failed (%d); wanted %pNF, left %pNF\n", 6302 err, &features, &dev->features); 6303 return -1; 6304 } 6305 6306 if (!err) 6307 dev->features = features; 6308 6309 return 1; 6310 } 6311 6312 /** 6313 * netdev_update_features - recalculate device features 6314 * @dev: the device to check 6315 * 6316 * Recalculate dev->features set and send notifications if it 6317 * has changed. Should be called after driver or hardware dependent 6318 * conditions might have changed that influence the features. 6319 */ 6320 void netdev_update_features(struct net_device *dev) 6321 { 6322 if (__netdev_update_features(dev)) 6323 netdev_features_change(dev); 6324 } 6325 EXPORT_SYMBOL(netdev_update_features); 6326 6327 /** 6328 * netdev_change_features - recalculate device features 6329 * @dev: the device to check 6330 * 6331 * Recalculate dev->features set and send notifications even 6332 * if they have not changed. Should be called instead of 6333 * netdev_update_features() if also dev->vlan_features might 6334 * have changed to allow the changes to be propagated to stacked 6335 * VLAN devices. 6336 */ 6337 void netdev_change_features(struct net_device *dev) 6338 { 6339 __netdev_update_features(dev); 6340 netdev_features_change(dev); 6341 } 6342 EXPORT_SYMBOL(netdev_change_features); 6343 6344 /** 6345 * netif_stacked_transfer_operstate - transfer operstate 6346 * @rootdev: the root or lower level device to transfer state from 6347 * @dev: the device to transfer operstate to 6348 * 6349 * Transfer operational state from root to device. This is normally 6350 * called when a stacking relationship exists between the root 6351 * device and the device(a leaf device). 
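 *
 * Editor's note (illustrative sketch, not part of the original source):
 * a stacking driver typically calls this from its netdevice notifier
 * whenever the lower device changes state, so that carrier and dormant
 * state propagate to the stacked device.  "lower_dev" and the
 * foo_get_upper() helper are hypothetical.
 *
 *      case NETDEV_CHANGE:
 *      case NETDEV_UP:
 *      case NETDEV_DOWN:
 *              netif_stacked_transfer_operstate(lower_dev,
 *                                               foo_get_upper(lower_dev));
 *              break;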
6352 */ 6353 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 6354 struct net_device *dev) 6355 { 6356 if (rootdev->operstate == IF_OPER_DORMANT) 6357 netif_dormant_on(dev); 6358 else 6359 netif_dormant_off(dev); 6360 6361 if (netif_carrier_ok(rootdev)) { 6362 if (!netif_carrier_ok(dev)) 6363 netif_carrier_on(dev); 6364 } else { 6365 if (netif_carrier_ok(dev)) 6366 netif_carrier_off(dev); 6367 } 6368 } 6369 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 6370 6371 #ifdef CONFIG_SYSFS 6372 static int netif_alloc_rx_queues(struct net_device *dev) 6373 { 6374 unsigned int i, count = dev->num_rx_queues; 6375 struct netdev_rx_queue *rx; 6376 size_t sz = count * sizeof(*rx); 6377 6378 BUG_ON(count < 1); 6379 6380 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6381 if (!rx) { 6382 rx = vzalloc(sz); 6383 if (!rx) 6384 return -ENOMEM; 6385 } 6386 dev->_rx = rx; 6387 6388 for (i = 0; i < count; i++) 6389 rx[i].dev = dev; 6390 return 0; 6391 } 6392 #endif 6393 6394 static void netdev_init_one_queue(struct net_device *dev, 6395 struct netdev_queue *queue, void *_unused) 6396 { 6397 /* Initialize queue lock */ 6398 spin_lock_init(&queue->_xmit_lock); 6399 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 6400 queue->xmit_lock_owner = -1; 6401 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 6402 queue->dev = dev; 6403 #ifdef CONFIG_BQL 6404 dql_init(&queue->dql, HZ); 6405 #endif 6406 } 6407 6408 static void netif_free_tx_queues(struct net_device *dev) 6409 { 6410 kvfree(dev->_tx); 6411 } 6412 6413 static int netif_alloc_netdev_queues(struct net_device *dev) 6414 { 6415 unsigned int count = dev->num_tx_queues; 6416 struct netdev_queue *tx; 6417 size_t sz = count * sizeof(*tx); 6418 6419 BUG_ON(count < 1 || count > 0xffff); 6420 6421 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6422 if (!tx) { 6423 tx = vzalloc(sz); 6424 if (!tx) 6425 return -ENOMEM; 6426 } 6427 dev->_tx = tx; 6428 6429 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 6430 spin_lock_init(&dev->tx_global_lock); 6431 6432 return 0; 6433 } 6434 6435 void netif_tx_stop_all_queues(struct net_device *dev) 6436 { 6437 unsigned int i; 6438 6439 for (i = 0; i < dev->num_tx_queues; i++) { 6440 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 6441 netif_tx_stop_queue(txq); 6442 } 6443 } 6444 EXPORT_SYMBOL(netif_tx_stop_all_queues); 6445 6446 /** 6447 * register_netdevice - register a network device 6448 * @dev: device to register 6449 * 6450 * Take a completed network device structure and add it to the kernel 6451 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6452 * chain. 0 is returned on success. A negative errno code is returned 6453 * on a failure to set up the device, or if the name is a duplicate. 6454 * 6455 * Callers must hold the rtnl semaphore. You may want 6456 * register_netdev() instead of this. 6457 * 6458 * BUGS: 6459 * The locking appears insufficient to guarantee two parallel registers 6460 * will not get the same name. 6461 */ 6462 6463 int register_netdevice(struct net_device *dev) 6464 { 6465 int ret; 6466 struct net *net = dev_net(dev); 6467 6468 BUG_ON(dev_boot_phase); 6469 ASSERT_RTNL(); 6470 6471 might_sleep(); 6472 6473 /* When net_device's are persistent, this will be fatal. 
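 *
 * Editor's note (illustrative sketch, not part of the original source):
 * the usual calling pattern from a driver probe path looks like the
 * following; struct foo_priv and foo_netdev_ops are hypothetical.
 *
 *      dev = alloc_etherdev(sizeof(struct foo_priv));
 *      if (!dev)
 *              return -ENOMEM;
 *      dev->netdev_ops = &foo_netdev_ops;
 *
 *      err = register_netdev(dev);
 *      if (err)
 *              free_netdev(dev);
 *
 * register_netdev() takes the RTNL lock internally; register_netdevice()
 * itself, as documented above, must be called with the lock already held.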
*/ 6474 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 6475 BUG_ON(!net); 6476 6477 spin_lock_init(&dev->addr_list_lock); 6478 netdev_set_addr_lockdep_class(dev); 6479 6480 ret = dev_get_valid_name(net, dev, dev->name); 6481 if (ret < 0) 6482 goto out; 6483 6484 /* Init, if this function is available */ 6485 if (dev->netdev_ops->ndo_init) { 6486 ret = dev->netdev_ops->ndo_init(dev); 6487 if (ret) { 6488 if (ret > 0) 6489 ret = -EIO; 6490 goto out; 6491 } 6492 } 6493 6494 if (((dev->hw_features | dev->features) & 6495 NETIF_F_HW_VLAN_CTAG_FILTER) && 6496 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 6497 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 6498 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 6499 ret = -EINVAL; 6500 goto err_uninit; 6501 } 6502 6503 ret = -EBUSY; 6504 if (!dev->ifindex) 6505 dev->ifindex = dev_new_index(net); 6506 else if (__dev_get_by_index(net, dev->ifindex)) 6507 goto err_uninit; 6508 6509 /* Transfer changeable features to wanted_features and enable 6510 * software offloads (GSO and GRO). 6511 */ 6512 dev->hw_features |= NETIF_F_SOFT_FEATURES; 6513 dev->features |= NETIF_F_SOFT_FEATURES; 6514 dev->wanted_features = dev->features & dev->hw_features; 6515 6516 if (!(dev->flags & IFF_LOOPBACK)) { 6517 dev->hw_features |= NETIF_F_NOCACHE_COPY; 6518 } 6519 6520 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 6521 */ 6522 dev->vlan_features |= NETIF_F_HIGHDMA; 6523 6524 /* Make NETIF_F_SG inheritable to tunnel devices. 6525 */ 6526 dev->hw_enc_features |= NETIF_F_SG; 6527 6528 /* Make NETIF_F_SG inheritable to MPLS. 6529 */ 6530 dev->mpls_features |= NETIF_F_SG; 6531 6532 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 6533 ret = notifier_to_errno(ret); 6534 if (ret) 6535 goto err_uninit; 6536 6537 ret = netdev_register_kobject(dev); 6538 if (ret) 6539 goto err_uninit; 6540 dev->reg_state = NETREG_REGISTERED; 6541 6542 __netdev_update_features(dev); 6543 6544 /* 6545 * Default initial state at registry is that the 6546 * device is present. 6547 */ 6548 6549 set_bit(__LINK_STATE_PRESENT, &dev->state); 6550 6551 linkwatch_init_dev(dev); 6552 6553 dev_init_scheduler(dev); 6554 dev_hold(dev); 6555 list_netdevice(dev); 6556 add_device_randomness(dev->dev_addr, dev->addr_len); 6557 6558 /* If the device has permanent device address, driver should 6559 * set dev_addr and also addr_assign_type should be set to 6560 * NET_ADDR_PERM (default value). 6561 */ 6562 if (dev->addr_assign_type == NET_ADDR_PERM) 6563 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 6564 6565 /* Notify protocols, that a new device appeared. */ 6566 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 6567 ret = notifier_to_errno(ret); 6568 if (ret) { 6569 rollback_registered(dev); 6570 dev->reg_state = NETREG_UNREGISTERED; 6571 } 6572 /* 6573 * Prevent userspace races by waiting until the network 6574 * device is fully setup before sending notifications. 
6575 */ 6576 if (!dev->rtnl_link_ops || 6577 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6578 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 6579 6580 out: 6581 return ret; 6582 6583 err_uninit: 6584 if (dev->netdev_ops->ndo_uninit) 6585 dev->netdev_ops->ndo_uninit(dev); 6586 goto out; 6587 } 6588 EXPORT_SYMBOL(register_netdevice); 6589 6590 /** 6591 * init_dummy_netdev - init a dummy network device for NAPI 6592 * @dev: device to init 6593 * 6594 * This takes a network device structure and initialize the minimum 6595 * amount of fields so it can be used to schedule NAPI polls without 6596 * registering a full blown interface. This is to be used by drivers 6597 * that need to tie several hardware interfaces to a single NAPI 6598 * poll scheduler due to HW limitations. 6599 */ 6600 int init_dummy_netdev(struct net_device *dev) 6601 { 6602 /* Clear everything. Note we don't initialize spinlocks 6603 * are they aren't supposed to be taken by any of the 6604 * NAPI code and this dummy netdev is supposed to be 6605 * only ever used for NAPI polls 6606 */ 6607 memset(dev, 0, sizeof(struct net_device)); 6608 6609 /* make sure we BUG if trying to hit standard 6610 * register/unregister code path 6611 */ 6612 dev->reg_state = NETREG_DUMMY; 6613 6614 /* NAPI wants this */ 6615 INIT_LIST_HEAD(&dev->napi_list); 6616 6617 /* a dummy interface is started by default */ 6618 set_bit(__LINK_STATE_PRESENT, &dev->state); 6619 set_bit(__LINK_STATE_START, &dev->state); 6620 6621 /* Note : We dont allocate pcpu_refcnt for dummy devices, 6622 * because users of this 'device' dont need to change 6623 * its refcount. 6624 */ 6625 6626 return 0; 6627 } 6628 EXPORT_SYMBOL_GPL(init_dummy_netdev); 6629 6630 6631 /** 6632 * register_netdev - register a network device 6633 * @dev: device to register 6634 * 6635 * Take a completed network device structure and add it to the kernel 6636 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6637 * chain. 0 is returned on success. A negative errno code is returned 6638 * on a failure to set up the device, or if the name is a duplicate. 6639 * 6640 * This is a wrapper around register_netdevice that takes the rtnl semaphore 6641 * and expands the device name if you passed a format string to 6642 * alloc_netdev. 6643 */ 6644 int register_netdev(struct net_device *dev) 6645 { 6646 int err; 6647 6648 rtnl_lock(); 6649 err = register_netdevice(dev); 6650 rtnl_unlock(); 6651 return err; 6652 } 6653 EXPORT_SYMBOL(register_netdev); 6654 6655 int netdev_refcnt_read(const struct net_device *dev) 6656 { 6657 int i, refcnt = 0; 6658 6659 for_each_possible_cpu(i) 6660 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 6661 return refcnt; 6662 } 6663 EXPORT_SYMBOL(netdev_refcnt_read); 6664 6665 /** 6666 * netdev_wait_allrefs - wait until all references are gone. 6667 * @dev: target net_device 6668 * 6669 * This is called when unregistering network devices. 6670 * 6671 * Any protocol or device that holds a reference should register 6672 * for netdevice notification, and cleanup and put back the 6673 * reference if they receive an UNREGISTER event. 6674 * We can get stuck here if buggy protocols don't correctly 6675 * call dev_put. 
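 *
 * Editor's note (illustrative sketch, not part of the original source):
 * the expected reference discipline for a protocol that caches a device
 * pointer; foo_cache is hypothetical.  Take the reference when storing
 * the pointer:
 *
 *      dev_hold(dev);
 *      foo_cache->dev = dev;
 *
 * and drop it from the protocol's netdevice notifier, otherwise the wait
 * loop below spins and keeps logging the "waiting for ... to become
 * free" message:
 *
 *      case NETDEV_UNREGISTER:
 *              if (foo_cache->dev == dev) {
 *                      foo_cache->dev = NULL;
 *                      dev_put(dev);
 *              }
 *              break;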
6676 */ 6677 static void netdev_wait_allrefs(struct net_device *dev) 6678 { 6679 unsigned long rebroadcast_time, warning_time; 6680 int refcnt; 6681 6682 linkwatch_forget_dev(dev); 6683 6684 rebroadcast_time = warning_time = jiffies; 6685 refcnt = netdev_refcnt_read(dev); 6686 6687 while (refcnt != 0) { 6688 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 6689 rtnl_lock(); 6690 6691 /* Rebroadcast unregister notification */ 6692 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6693 6694 __rtnl_unlock(); 6695 rcu_barrier(); 6696 rtnl_lock(); 6697 6698 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6699 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 6700 &dev->state)) { 6701 /* We must not have linkwatch events 6702 * pending on unregister. If this 6703 * happens, we simply run the queue 6704 * unscheduled, resulting in a noop 6705 * for this device. 6706 */ 6707 linkwatch_run_queue(); 6708 } 6709 6710 __rtnl_unlock(); 6711 6712 rebroadcast_time = jiffies; 6713 } 6714 6715 msleep(250); 6716 6717 refcnt = netdev_refcnt_read(dev); 6718 6719 if (time_after(jiffies, warning_time + 10 * HZ)) { 6720 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 6721 dev->name, refcnt); 6722 warning_time = jiffies; 6723 } 6724 } 6725 } 6726 6727 /* The sequence is: 6728 * 6729 * rtnl_lock(); 6730 * ... 6731 * register_netdevice(x1); 6732 * register_netdevice(x2); 6733 * ... 6734 * unregister_netdevice(y1); 6735 * unregister_netdevice(y2); 6736 * ... 6737 * rtnl_unlock(); 6738 * free_netdev(y1); 6739 * free_netdev(y2); 6740 * 6741 * We are invoked by rtnl_unlock(). 6742 * This allows us to deal with problems: 6743 * 1) We can delete sysfs objects which invoke hotplug 6744 * without deadlocking with linkwatch via keventd. 6745 * 2) Since we run with the RTNL semaphore not held, we can sleep 6746 * safely in order to wait for the netdev refcnt to drop to zero. 6747 * 6748 * We must not return until all unregister events added during 6749 * the interval the lock was held have been completed. 
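 *
 * Editor's note (illustrative sketch, not part of the original source):
 * the sequence above, as seen from a driver's remove path, is simply
 *
 *      rtnl_lock();
 *      unregister_netdevice(dev);
 *      rtnl_unlock();
 *      free_netdev(dev);
 *
 * where netdev_run_todo() is entered from rtnl_unlock().  Drivers usually
 * use the unregister_netdev() wrapper, which takes and releases the RTNL
 * lock around the same call.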
6750 */ 6751 void netdev_run_todo(void) 6752 { 6753 struct list_head list; 6754 6755 /* Snapshot list, allow later requests */ 6756 list_replace_init(&net_todo_list, &list); 6757 6758 __rtnl_unlock(); 6759 6760 6761 /* Wait for rcu callbacks to finish before next phase */ 6762 if (!list_empty(&list)) 6763 rcu_barrier(); 6764 6765 while (!list_empty(&list)) { 6766 struct net_device *dev 6767 = list_first_entry(&list, struct net_device, todo_list); 6768 list_del(&dev->todo_list); 6769 6770 rtnl_lock(); 6771 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6772 __rtnl_unlock(); 6773 6774 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 6775 pr_err("network todo '%s' but state %d\n", 6776 dev->name, dev->reg_state); 6777 dump_stack(); 6778 continue; 6779 } 6780 6781 dev->reg_state = NETREG_UNREGISTERED; 6782 6783 on_each_cpu(flush_backlog, dev, 1); 6784 6785 netdev_wait_allrefs(dev); 6786 6787 /* paranoia */ 6788 BUG_ON(netdev_refcnt_read(dev)); 6789 BUG_ON(!list_empty(&dev->ptype_all)); 6790 BUG_ON(!list_empty(&dev->ptype_specific)); 6791 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 6792 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 6793 WARN_ON(dev->dn_ptr); 6794 6795 if (dev->destructor) 6796 dev->destructor(dev); 6797 6798 /* Report a network device has been unregistered */ 6799 rtnl_lock(); 6800 dev_net(dev)->dev_unreg_count--; 6801 __rtnl_unlock(); 6802 wake_up(&netdev_unregistering_wq); 6803 6804 /* Free network device */ 6805 kobject_put(&dev->dev.kobj); 6806 } 6807 } 6808 6809 /* Convert net_device_stats to rtnl_link_stats64. They have the same 6810 * fields in the same order, with only the type differing. 6811 */ 6812 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 6813 const struct net_device_stats *netdev_stats) 6814 { 6815 #if BITS_PER_LONG == 64 6816 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 6817 memcpy(stats64, netdev_stats, sizeof(*stats64)); 6818 #else 6819 size_t i, n = sizeof(*stats64) / sizeof(u64); 6820 const unsigned long *src = (const unsigned long *)netdev_stats; 6821 u64 *dst = (u64 *)stats64; 6822 6823 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 6824 sizeof(*stats64) / sizeof(u64)); 6825 for (i = 0; i < n; i++) 6826 dst[i] = src[i]; 6827 #endif 6828 } 6829 EXPORT_SYMBOL(netdev_stats_to_stats64); 6830 6831 /** 6832 * dev_get_stats - get network device statistics 6833 * @dev: device to get statistics from 6834 * @storage: place to store stats 6835 * 6836 * Get network statistics from device. Return @storage. 6837 * The device driver may provide its own method by setting 6838 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 6839 * otherwise the internal statistics structure is used. 
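 *
 * Editor's note (illustrative sketch, not part of the original source):
 * a typical reader fills an on-stack structure, which does not need to be
 * zeroed by the caller:
 *
 *      struct rtnl_link_stats64 stats;
 *
 *      dev_get_stats(dev, &stats);
 *      netdev_info(dev, "%llu packets received\n",
 *                  (unsigned long long)stats.rx_packets);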
6840 */ 6841 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 6842 struct rtnl_link_stats64 *storage) 6843 { 6844 const struct net_device_ops *ops = dev->netdev_ops; 6845 6846 if (ops->ndo_get_stats64) { 6847 memset(storage, 0, sizeof(*storage)); 6848 ops->ndo_get_stats64(dev, storage); 6849 } else if (ops->ndo_get_stats) { 6850 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 6851 } else { 6852 netdev_stats_to_stats64(storage, &dev->stats); 6853 } 6854 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 6855 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 6856 return storage; 6857 } 6858 EXPORT_SYMBOL(dev_get_stats); 6859 6860 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 6861 { 6862 struct netdev_queue *queue = dev_ingress_queue(dev); 6863 6864 #ifdef CONFIG_NET_CLS_ACT 6865 if (queue) 6866 return queue; 6867 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 6868 if (!queue) 6869 return NULL; 6870 netdev_init_one_queue(dev, queue, NULL); 6871 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 6872 queue->qdisc_sleeping = &noop_qdisc; 6873 rcu_assign_pointer(dev->ingress_queue, queue); 6874 #endif 6875 return queue; 6876 } 6877 6878 static const struct ethtool_ops default_ethtool_ops; 6879 6880 void netdev_set_default_ethtool_ops(struct net_device *dev, 6881 const struct ethtool_ops *ops) 6882 { 6883 if (dev->ethtool_ops == &default_ethtool_ops) 6884 dev->ethtool_ops = ops; 6885 } 6886 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 6887 6888 void netdev_freemem(struct net_device *dev) 6889 { 6890 char *addr = (char *)dev - dev->padded; 6891 6892 kvfree(addr); 6893 } 6894 6895 /** 6896 * alloc_netdev_mqs - allocate network device 6897 * @sizeof_priv: size of private data to allocate space for 6898 * @name: device name format string 6899 * @name_assign_type: origin of device name 6900 * @setup: callback to initialize device 6901 * @txqs: the number of TX subqueues to allocate 6902 * @rxqs: the number of RX subqueues to allocate 6903 * 6904 * Allocates a struct net_device with private data area for driver use 6905 * and performs basic initialization. Also allocates subqueue structs 6906 * for each queue on the device. 
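 *
 * Editor's note (illustrative sketch, not part of the original source):
 * struct foo_priv, foo_setup() and the queue counts below are
 * hypothetical.
 *
 *      dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *                             NET_NAME_UNKNOWN, foo_setup, 4, 4);
 *      if (!dev)
 *              return -ENOMEM;
 *      priv = netdev_priv(dev);
 *
 * Ethernet drivers normally use the alloc_etherdev()/alloc_etherdev_mq()
 * wrappers, which pass ether_setup() as the setup callback.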
6907 */ 6908 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 6909 unsigned char name_assign_type, 6910 void (*setup)(struct net_device *), 6911 unsigned int txqs, unsigned int rxqs) 6912 { 6913 struct net_device *dev; 6914 size_t alloc_size; 6915 struct net_device *p; 6916 6917 BUG_ON(strlen(name) >= sizeof(dev->name)); 6918 6919 if (txqs < 1) { 6920 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 6921 return NULL; 6922 } 6923 6924 #ifdef CONFIG_SYSFS 6925 if (rxqs < 1) { 6926 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 6927 return NULL; 6928 } 6929 #endif 6930 6931 alloc_size = sizeof(struct net_device); 6932 if (sizeof_priv) { 6933 /* ensure 32-byte alignment of private area */ 6934 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 6935 alloc_size += sizeof_priv; 6936 } 6937 /* ensure 32-byte alignment of whole construct */ 6938 alloc_size += NETDEV_ALIGN - 1; 6939 6940 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6941 if (!p) 6942 p = vzalloc(alloc_size); 6943 if (!p) 6944 return NULL; 6945 6946 dev = PTR_ALIGN(p, NETDEV_ALIGN); 6947 dev->padded = (char *)dev - (char *)p; 6948 6949 dev->pcpu_refcnt = alloc_percpu(int); 6950 if (!dev->pcpu_refcnt) 6951 goto free_dev; 6952 6953 if (dev_addr_init(dev)) 6954 goto free_pcpu; 6955 6956 dev_mc_init(dev); 6957 dev_uc_init(dev); 6958 6959 dev_net_set(dev, &init_net); 6960 6961 dev->gso_max_size = GSO_MAX_SIZE; 6962 dev->gso_max_segs = GSO_MAX_SEGS; 6963 dev->gso_min_segs = 0; 6964 6965 INIT_LIST_HEAD(&dev->napi_list); 6966 INIT_LIST_HEAD(&dev->unreg_list); 6967 INIT_LIST_HEAD(&dev->close_list); 6968 INIT_LIST_HEAD(&dev->link_watch_list); 6969 INIT_LIST_HEAD(&dev->adj_list.upper); 6970 INIT_LIST_HEAD(&dev->adj_list.lower); 6971 INIT_LIST_HEAD(&dev->all_adj_list.upper); 6972 INIT_LIST_HEAD(&dev->all_adj_list.lower); 6973 INIT_LIST_HEAD(&dev->ptype_all); 6974 INIT_LIST_HEAD(&dev->ptype_specific); 6975 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 6976 setup(dev); 6977 6978 dev->num_tx_queues = txqs; 6979 dev->real_num_tx_queues = txqs; 6980 if (netif_alloc_netdev_queues(dev)) 6981 goto free_all; 6982 6983 #ifdef CONFIG_SYSFS 6984 dev->num_rx_queues = rxqs; 6985 dev->real_num_rx_queues = rxqs; 6986 if (netif_alloc_rx_queues(dev)) 6987 goto free_all; 6988 #endif 6989 6990 strcpy(dev->name, name); 6991 dev->name_assign_type = name_assign_type; 6992 dev->group = INIT_NETDEV_GROUP; 6993 if (!dev->ethtool_ops) 6994 dev->ethtool_ops = &default_ethtool_ops; 6995 6996 nf_hook_ingress_init(dev); 6997 6998 return dev; 6999 7000 free_all: 7001 free_netdev(dev); 7002 return NULL; 7003 7004 free_pcpu: 7005 free_percpu(dev->pcpu_refcnt); 7006 free_dev: 7007 netdev_freemem(dev); 7008 return NULL; 7009 } 7010 EXPORT_SYMBOL(alloc_netdev_mqs); 7011 7012 /** 7013 * free_netdev - free network device 7014 * @dev: device 7015 * 7016 * This function does the last stage of destroying an allocated device 7017 * interface. The reference to the device object is released. 7018 * If this is the last reference then it will be freed. 
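 *
 * Editor's note (illustrative sketch, not part of the original source):
 * free_netdev() is also the correct cleanup when registration failed or
 * never happened, e.g. in a probe error path:
 *
 *      err = register_netdev(dev);
 *      if (err) {
 *              free_netdev(dev);
 *              return err;
 *      }
 *
 * The NETREG_UNINITIALIZED branch in the function body below covers the
 * case where registration never completed.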
7019 */ 7020 void free_netdev(struct net_device *dev) 7021 { 7022 struct napi_struct *p, *n; 7023 7024 netif_free_tx_queues(dev); 7025 #ifdef CONFIG_SYSFS 7026 kvfree(dev->_rx); 7027 #endif 7028 7029 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 7030 7031 /* Flush device addresses */ 7032 dev_addr_flush(dev); 7033 7034 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 7035 netif_napi_del(p); 7036 7037 free_percpu(dev->pcpu_refcnt); 7038 dev->pcpu_refcnt = NULL; 7039 7040 /* Compatibility with error handling in drivers */ 7041 if (dev->reg_state == NETREG_UNINITIALIZED) { 7042 netdev_freemem(dev); 7043 return; 7044 } 7045 7046 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 7047 dev->reg_state = NETREG_RELEASED; 7048 7049 /* will free via device release */ 7050 put_device(&dev->dev); 7051 } 7052 EXPORT_SYMBOL(free_netdev); 7053 7054 /** 7055 * synchronize_net - Synchronize with packet receive processing 7056 * 7057 * Wait for packets currently being received to be done. 7058 * Does not block later packets from starting. 7059 */ 7060 void synchronize_net(void) 7061 { 7062 might_sleep(); 7063 if (rtnl_is_locked()) 7064 synchronize_rcu_expedited(); 7065 else 7066 synchronize_rcu(); 7067 } 7068 EXPORT_SYMBOL(synchronize_net); 7069 7070 /** 7071 * unregister_netdevice_queue - remove device from the kernel 7072 * @dev: device 7073 * @head: list 7074 * 7075 * This function shuts down a device interface and removes it 7076 * from the kernel tables. 7077 * If head not NULL, device is queued to be unregistered later. 7078 * 7079 * Callers must hold the rtnl semaphore. You may want 7080 * unregister_netdev() instead of this. 7081 */ 7082 7083 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 7084 { 7085 ASSERT_RTNL(); 7086 7087 if (head) { 7088 list_move_tail(&dev->unreg_list, head); 7089 } else { 7090 rollback_registered(dev); 7091 /* Finish processing unregister after unlock */ 7092 net_set_todo(dev); 7093 } 7094 } 7095 EXPORT_SYMBOL(unregister_netdevice_queue); 7096 7097 /** 7098 * unregister_netdevice_many - unregister many devices 7099 * @head: list of devices 7100 * 7101 * Note: As most callers use a stack allocated list_head, 7102 * we force a list_del() to make sure stack wont be corrupted later. 7103 */ 7104 void unregister_netdevice_many(struct list_head *head) 7105 { 7106 struct net_device *dev; 7107 7108 if (!list_empty(head)) { 7109 rollback_registered_many(head); 7110 list_for_each_entry(dev, head, unreg_list) 7111 net_set_todo(dev); 7112 list_del(head); 7113 } 7114 } 7115 EXPORT_SYMBOL(unregister_netdevice_many); 7116 7117 /** 7118 * unregister_netdev - remove device from the kernel 7119 * @dev: device 7120 * 7121 * This function shuts down a device interface and removes it 7122 * from the kernel tables. 7123 * 7124 * This is just a wrapper for unregister_netdevice that takes 7125 * the rtnl semaphore. In general you want to use this and not 7126 * unregister_netdevice. 7127 */ 7128 void unregister_netdev(struct net_device *dev) 7129 { 7130 rtnl_lock(); 7131 unregister_netdevice(dev); 7132 rtnl_unlock(); 7133 } 7134 EXPORT_SYMBOL(unregister_netdev); 7135 7136 /** 7137 * dev_change_net_namespace - move device to different nethost namespace 7138 * @dev: device 7139 * @net: network namespace 7140 * @pat: If not NULL name pattern to try if the current device name 7141 * is already taken in the destination network namespace. 7142 * 7143 * This function shuts down a device interface and moves it 7144 * to a new network namespace. 
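 *
 * Editor's note (illustrative sketch, not part of the original source):
 * "target_net" is an assumed namespace reference obtained elsewhere, for
 * instance via get_net_ns_by_fd() or get_net_ns_by_pid().
 *
 *      rtnl_lock();
 *      err = dev_change_net_namespace(dev, target_net, "eth%d");
 *      rtnl_unlock();
 *
 * The "eth%d" pattern is only used if the device's current name is
 * already taken in the destination namespace.
 *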
On success 0 is returned, on 7145 * a failure a netagive errno code is returned. 7146 * 7147 * Callers must hold the rtnl semaphore. 7148 */ 7149 7150 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 7151 { 7152 int err; 7153 7154 ASSERT_RTNL(); 7155 7156 /* Don't allow namespace local devices to be moved. */ 7157 err = -EINVAL; 7158 if (dev->features & NETIF_F_NETNS_LOCAL) 7159 goto out; 7160 7161 /* Ensure the device has been registrered */ 7162 if (dev->reg_state != NETREG_REGISTERED) 7163 goto out; 7164 7165 /* Get out if there is nothing todo */ 7166 err = 0; 7167 if (net_eq(dev_net(dev), net)) 7168 goto out; 7169 7170 /* Pick the destination device name, and ensure 7171 * we can use it in the destination network namespace. 7172 */ 7173 err = -EEXIST; 7174 if (__dev_get_by_name(net, dev->name)) { 7175 /* We get here if we can't use the current device name */ 7176 if (!pat) 7177 goto out; 7178 if (dev_get_valid_name(net, dev, pat) < 0) 7179 goto out; 7180 } 7181 7182 /* 7183 * And now a mini version of register_netdevice unregister_netdevice. 7184 */ 7185 7186 /* If device is running close it first. */ 7187 dev_close(dev); 7188 7189 /* And unlink it from device chain */ 7190 err = -ENODEV; 7191 unlist_netdevice(dev); 7192 7193 synchronize_net(); 7194 7195 /* Shutdown queueing discipline. */ 7196 dev_shutdown(dev); 7197 7198 /* Notify protocols, that we are about to destroy 7199 this device. They should clean all the things. 7200 7201 Note that dev->reg_state stays at NETREG_REGISTERED. 7202 This is wanted because this way 8021q and macvlan know 7203 the device is just moving and can keep their slaves up. 7204 */ 7205 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7206 rcu_barrier(); 7207 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7208 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 7209 7210 /* 7211 * Flush the unicast and multicast chains 7212 */ 7213 dev_uc_flush(dev); 7214 dev_mc_flush(dev); 7215 7216 /* Send a netdev-removed uevent to the old namespace */ 7217 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 7218 netdev_adjacent_del_links(dev); 7219 7220 /* Actually switch the network namespace */ 7221 dev_net_set(dev, net); 7222 7223 /* If there is an ifindex conflict assign a new one */ 7224 if (__dev_get_by_index(net, dev->ifindex)) 7225 dev->ifindex = dev_new_index(net); 7226 7227 /* Send a netdev-add uevent to the new namespace */ 7228 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 7229 netdev_adjacent_add_links(dev); 7230 7231 /* Fixup kobjects */ 7232 err = device_rename(&dev->dev, dev->name); 7233 WARN_ON(err); 7234 7235 /* Add the device back in the hashes */ 7236 list_netdevice(dev); 7237 7238 /* Notify protocols, that a new device appeared. */ 7239 call_netdevice_notifiers(NETDEV_REGISTER, dev); 7240 7241 /* 7242 * Prevent userspace races by waiting until the network 7243 * device is fully setup before sending notifications. 
7244 */ 7245 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7246 7247 synchronize_net(); 7248 err = 0; 7249 out: 7250 return err; 7251 } 7252 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 7253 7254 static int dev_cpu_callback(struct notifier_block *nfb, 7255 unsigned long action, 7256 void *ocpu) 7257 { 7258 struct sk_buff **list_skb; 7259 struct sk_buff *skb; 7260 unsigned int cpu, oldcpu = (unsigned long)ocpu; 7261 struct softnet_data *sd, *oldsd; 7262 7263 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 7264 return NOTIFY_OK; 7265 7266 local_irq_disable(); 7267 cpu = smp_processor_id(); 7268 sd = &per_cpu(softnet_data, cpu); 7269 oldsd = &per_cpu(softnet_data, oldcpu); 7270 7271 /* Find end of our completion_queue. */ 7272 list_skb = &sd->completion_queue; 7273 while (*list_skb) 7274 list_skb = &(*list_skb)->next; 7275 /* Append completion queue from offline CPU. */ 7276 *list_skb = oldsd->completion_queue; 7277 oldsd->completion_queue = NULL; 7278 7279 /* Append output queue from offline CPU. */ 7280 if (oldsd->output_queue) { 7281 *sd->output_queue_tailp = oldsd->output_queue; 7282 sd->output_queue_tailp = oldsd->output_queue_tailp; 7283 oldsd->output_queue = NULL; 7284 oldsd->output_queue_tailp = &oldsd->output_queue; 7285 } 7286 /* Append NAPI poll list from offline CPU, with one exception : 7287 * process_backlog() must be called by cpu owning percpu backlog. 7288 * We properly handle process_queue & input_pkt_queue later. 7289 */ 7290 while (!list_empty(&oldsd->poll_list)) { 7291 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 7292 struct napi_struct, 7293 poll_list); 7294 7295 list_del_init(&napi->poll_list); 7296 if (napi->poll == process_backlog) 7297 napi->state = 0; 7298 else 7299 ____napi_schedule(sd, napi); 7300 } 7301 7302 raise_softirq_irqoff(NET_TX_SOFTIRQ); 7303 local_irq_enable(); 7304 7305 /* Process offline CPU's input_pkt_queue */ 7306 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 7307 netif_rx_ni(skb); 7308 input_queue_head_incr(oldsd); 7309 } 7310 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 7311 netif_rx_ni(skb); 7312 input_queue_head_incr(oldsd); 7313 } 7314 7315 return NOTIFY_OK; 7316 } 7317 7318 7319 /** 7320 * netdev_increment_features - increment feature set by one 7321 * @all: current feature set 7322 * @one: new feature set 7323 * @mask: mask feature set 7324 * 7325 * Computes a new feature set after adding a device with feature set 7326 * @one to the master device with current feature set @all. Will not 7327 * enable anything that is off in @mask. Returns the new feature set. 7328 */ 7329 netdev_features_t netdev_increment_features(netdev_features_t all, 7330 netdev_features_t one, netdev_features_t mask) 7331 { 7332 if (mask & NETIF_F_GEN_CSUM) 7333 mask |= NETIF_F_ALL_CSUM; 7334 mask |= NETIF_F_VLAN_CHALLENGED; 7335 7336 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; 7337 all &= one | ~NETIF_F_ALL_FOR_ALL; 7338 7339 /* If one device supports hw checksumming, set for all. 
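 *
 * Editor's note (illustrative sketch, not part of the original source):
 * an aggregating device such as a bond or bridge recomputes its feature
 * set by folding in each lower device; "upper_dev" and the starting value
 * are assumptions, and real users seed the running value with their own
 * base feature mask.
 *
 *      struct net_device *lower;
 *      struct list_head *iter;
 *      netdev_features_t features = upper_dev->features;
 *
 *      netdev_for_each_lower_dev(upper_dev, lower, iter)
 *              features = netdev_increment_features(features,
 *                                                   lower->features,
 *                                                   upper_dev->features);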
*/ 7340 if (all & NETIF_F_GEN_CSUM) 7341 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); 7342 7343 return all; 7344 } 7345 EXPORT_SYMBOL(netdev_increment_features); 7346 7347 static struct hlist_head * __net_init netdev_create_hash(void) 7348 { 7349 int i; 7350 struct hlist_head *hash; 7351 7352 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 7353 if (hash != NULL) 7354 for (i = 0; i < NETDEV_HASHENTRIES; i++) 7355 INIT_HLIST_HEAD(&hash[i]); 7356 7357 return hash; 7358 } 7359 7360 /* Initialize per network namespace state */ 7361 static int __net_init netdev_init(struct net *net) 7362 { 7363 if (net != &init_net) 7364 INIT_LIST_HEAD(&net->dev_base_head); 7365 7366 net->dev_name_head = netdev_create_hash(); 7367 if (net->dev_name_head == NULL) 7368 goto err_name; 7369 7370 net->dev_index_head = netdev_create_hash(); 7371 if (net->dev_index_head == NULL) 7372 goto err_idx; 7373 7374 return 0; 7375 7376 err_idx: 7377 kfree(net->dev_name_head); 7378 err_name: 7379 return -ENOMEM; 7380 } 7381 7382 /** 7383 * netdev_drivername - network driver for the device 7384 * @dev: network device 7385 * 7386 * Determine network driver for device. 7387 */ 7388 const char *netdev_drivername(const struct net_device *dev) 7389 { 7390 const struct device_driver *driver; 7391 const struct device *parent; 7392 const char *empty = ""; 7393 7394 parent = dev->dev.parent; 7395 if (!parent) 7396 return empty; 7397 7398 driver = parent->driver; 7399 if (driver && driver->name) 7400 return driver->name; 7401 return empty; 7402 } 7403 7404 static void __netdev_printk(const char *level, const struct net_device *dev, 7405 struct va_format *vaf) 7406 { 7407 if (dev && dev->dev.parent) { 7408 dev_printk_emit(level[1] - '0', 7409 dev->dev.parent, 7410 "%s %s %s%s: %pV", 7411 dev_driver_string(dev->dev.parent), 7412 dev_name(dev->dev.parent), 7413 netdev_name(dev), netdev_reg_state(dev), 7414 vaf); 7415 } else if (dev) { 7416 printk("%s%s%s: %pV", 7417 level, netdev_name(dev), netdev_reg_state(dev), vaf); 7418 } else { 7419 printk("%s(NULL net_device): %pV", level, vaf); 7420 } 7421 } 7422 7423 void netdev_printk(const char *level, const struct net_device *dev, 7424 const char *format, ...) 7425 { 7426 struct va_format vaf; 7427 va_list args; 7428 7429 va_start(args, format); 7430 7431 vaf.fmt = format; 7432 vaf.va = &args; 7433 7434 __netdev_printk(level, dev, &vaf); 7435 7436 va_end(args); 7437 } 7438 EXPORT_SYMBOL(netdev_printk); 7439 7440 #define define_netdev_printk_level(func, level) \ 7441 void func(const struct net_device *dev, const char *fmt, ...) 
\ 7442 { \ 7443 struct va_format vaf; \ 7444 va_list args; \ 7445 \ 7446 va_start(args, fmt); \ 7447 \ 7448 vaf.fmt = fmt; \ 7449 vaf.va = &args; \ 7450 \ 7451 __netdev_printk(level, dev, &vaf); \ 7452 \ 7453 va_end(args); \ 7454 } \ 7455 EXPORT_SYMBOL(func); 7456 7457 define_netdev_printk_level(netdev_emerg, KERN_EMERG); 7458 define_netdev_printk_level(netdev_alert, KERN_ALERT); 7459 define_netdev_printk_level(netdev_crit, KERN_CRIT); 7460 define_netdev_printk_level(netdev_err, KERN_ERR); 7461 define_netdev_printk_level(netdev_warn, KERN_WARNING); 7462 define_netdev_printk_level(netdev_notice, KERN_NOTICE); 7463 define_netdev_printk_level(netdev_info, KERN_INFO); 7464 7465 static void __net_exit netdev_exit(struct net *net) 7466 { 7467 kfree(net->dev_name_head); 7468 kfree(net->dev_index_head); 7469 } 7470 7471 static struct pernet_operations __net_initdata netdev_net_ops = { 7472 .init = netdev_init, 7473 .exit = netdev_exit, 7474 }; 7475 7476 static void __net_exit default_device_exit(struct net *net) 7477 { 7478 struct net_device *dev, *aux; 7479 /* 7480 * Push all migratable network devices back to the 7481 * initial network namespace 7482 */ 7483 rtnl_lock(); 7484 for_each_netdev_safe(net, dev, aux) { 7485 int err; 7486 char fb_name[IFNAMSIZ]; 7487 7488 /* Ignore unmoveable devices (i.e. loopback) */ 7489 if (dev->features & NETIF_F_NETNS_LOCAL) 7490 continue; 7491 7492 /* Leave virtual devices for the generic cleanup */ 7493 if (dev->rtnl_link_ops) 7494 continue; 7495 7496 /* Push remaining network devices to init_net */ 7497 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 7498 err = dev_change_net_namespace(dev, &init_net, fb_name); 7499 if (err) { 7500 pr_emerg("%s: failed to move %s to init_net: %d\n", 7501 __func__, dev->name, err); 7502 BUG(); 7503 } 7504 } 7505 rtnl_unlock(); 7506 } 7507 7508 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) 7509 { 7510 /* Return with the rtnl_lock held when there are no network 7511 * devices unregistering in any network namespace in net_list. 7512 */ 7513 struct net *net; 7514 bool unregistering; 7515 DEFINE_WAIT_FUNC(wait, woken_wake_function); 7516 7517 add_wait_queue(&netdev_unregistering_wq, &wait); 7518 for (;;) { 7519 unregistering = false; 7520 rtnl_lock(); 7521 list_for_each_entry(net, net_list, exit_list) { 7522 if (net->dev_unreg_count > 0) { 7523 unregistering = true; 7524 break; 7525 } 7526 } 7527 if (!unregistering) 7528 break; 7529 __rtnl_unlock(); 7530 7531 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 7532 } 7533 remove_wait_queue(&netdev_unregistering_wq, &wait); 7534 } 7535 7536 static void __net_exit default_device_exit_batch(struct list_head *net_list) 7537 { 7538 /* At exit all network devices most be removed from a network 7539 * namespace. Do this in the reverse order of registration. 7540 * Do this across as many network namespaces as possible to 7541 * improve batching efficiency. 7542 */ 7543 struct net_device *dev; 7544 struct net *net; 7545 LIST_HEAD(dev_kill_list); 7546 7547 /* To prevent network device cleanup code from dereferencing 7548 * loopback devices or network devices that have been freed 7549 * wait here for all pending unregistrations to complete, 7550 * before unregistring the loopback device and allowing the 7551 * network namespace be freed. 7552 * 7553 * The netdev todo list containing all network devices 7554 * unregistrations that happen in default_device_exit_batch 7555 * will run in the rtnl_unlock() at the end of 7556 * default_device_exit_batch. 
7557 */ 7558 rtnl_lock_unregistering(net_list); 7559 list_for_each_entry(net, net_list, exit_list) { 7560 for_each_netdev_reverse(net, dev) { 7561 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) 7562 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); 7563 else 7564 unregister_netdevice_queue(dev, &dev_kill_list); 7565 } 7566 } 7567 unregister_netdevice_many(&dev_kill_list); 7568 rtnl_unlock(); 7569 } 7570 7571 static struct pernet_operations __net_initdata default_device_ops = { 7572 .exit = default_device_exit, 7573 .exit_batch = default_device_exit_batch, 7574 }; 7575 7576 /* 7577 * Initialize the DEV module. At boot time this walks the device list and 7578 * unhooks any devices that fail to initialise (normally hardware not 7579 * present) and leaves us with a valid list of present and active devices. 7580 * 7581 */ 7582 7583 /* 7584 * This is called single threaded during boot, so no need 7585 * to take the rtnl semaphore. 7586 */ 7587 static int __init net_dev_init(void) 7588 { 7589 int i, rc = -ENOMEM; 7590 7591 BUG_ON(!dev_boot_phase); 7592 7593 if (dev_proc_init()) 7594 goto out; 7595 7596 if (netdev_kobject_init()) 7597 goto out; 7598 7599 INIT_LIST_HEAD(&ptype_all); 7600 for (i = 0; i < PTYPE_HASH_SIZE; i++) 7601 INIT_LIST_HEAD(&ptype_base[i]); 7602 7603 INIT_LIST_HEAD(&offload_base); 7604 7605 if (register_pernet_subsys(&netdev_net_ops)) 7606 goto out; 7607 7608 /* 7609 * Initialise the packet receive queues. 7610 */ 7611 7612 for_each_possible_cpu(i) { 7613 struct softnet_data *sd = &per_cpu(softnet_data, i); 7614 7615 skb_queue_head_init(&sd->input_pkt_queue); 7616 skb_queue_head_init(&sd->process_queue); 7617 INIT_LIST_HEAD(&sd->poll_list); 7618 sd->output_queue_tailp = &sd->output_queue; 7619 #ifdef CONFIG_RPS 7620 sd->csd.func = rps_trigger_softirq; 7621 sd->csd.info = sd; 7622 sd->cpu = i; 7623 #endif 7624 7625 sd->backlog.poll = process_backlog; 7626 sd->backlog.weight = weight_p; 7627 } 7628 7629 dev_boot_phase = 0; 7630 7631 /* The loopback device is special if any other network devices 7632 * is present in a network namespace the loopback device must 7633 * be present. Since we now dynamically allocate and free the 7634 * loopback device ensure this invariant is maintained by 7635 * keeping the loopback device as the first device on the 7636 * list of network devices. Ensuring the loopback devices 7637 * is the first device that appears and the last network device 7638 * that disappears. 7639 */ 7640 if (register_pernet_device(&loopback_net_ops)) 7641 goto out; 7642 7643 if (register_pernet_device(&default_device_ops)) 7644 goto out; 7645 7646 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 7647 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 7648 7649 hotcpu_notifier(dev_cpu_callback, 0); 7650 dst_init(); 7651 rc = 0; 7652 out: 7653 return rc; 7654 } 7655 7656 subsys_initcall(net_dev_init); 7657
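/*
 * Editor's appendix (illustrative sketch, not part of the original file):
 * a minimal out-of-tree module showing how the notifier chain and the
 * netdev_*() printk helpers implemented above are typically consumed.
 * All foo_* names are hypothetical.
 *
 *      static int foo_event(struct notifier_block *nb, unsigned long event,
 *                           void *ptr)
 *      {
 *              struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *              switch (event) {
 *              case NETDEV_UP:
 *                      netdev_info(dev, "interface is up\n");
 *                      break;
 *              case NETDEV_CHANGEMTU:
 *                      netdev_info(dev, "MTU is now %u\n", dev->mtu);
 *                      break;
 *              }
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block foo_notifier = {
 *              .notifier_call = foo_event,
 *      };
 *
 *      static int __init foo_init(void)
 *      {
 *              return register_netdevice_notifier(&foo_notifier);
 *      }
 *
 *      static void __exit foo_exit(void)
 *      {
 *              unregister_netdevice_notifier(&foo_notifier);
 *      }
 *
 *      module_init(foo_init);
 *      module_exit(foo_exit);
 *      MODULE_LICENSE("GPL");
 */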