1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 
49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <linux/bitops.h> 77 #include <linux/capability.h> 78 #include <linux/cpu.h> 79 #include <linux/types.h> 80 #include <linux/kernel.h> 81 #include <linux/hash.h> 82 #include <linux/slab.h> 83 #include <linux/sched.h> 84 #include <linux/mutex.h> 85 #include <linux/string.h> 86 #include <linux/mm.h> 87 #include <linux/socket.h> 88 #include <linux/sockios.h> 89 #include <linux/errno.h> 90 #include <linux/interrupt.h> 91 #include <linux/if_ether.h> 92 #include <linux/netdevice.h> 93 #include <linux/etherdevice.h> 94 #include <linux/ethtool.h> 95 #include <linux/notifier.h> 96 #include <linux/skbuff.h> 97 #include <net/net_namespace.h> 98 #include <net/sock.h> 99 #include <net/busy_poll.h> 100 #include <linux/rtnetlink.h> 101 #include <linux/stat.h> 102 #include <net/dst.h> 103 #include <net/dst_metadata.h> 104 #include <net/pkt_sched.h> 105 #include <net/checksum.h> 106 #include <net/xfrm.h> 107 #include <linux/highmem.h> 108 #include 
<linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/* Guards device rename; readers of dev->name retry on a concurrent rename. */
static seqcount_t devnet_rename_seq;

/*
 * Bump the per-namespace device-list generation counter, skipping 0
 * (0 would look like "never changed" to readers sampling the counter).
 */
static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}

/* Hash bucket for name lookups; the name is bounded to IFNAMSIZ bytes. */
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

/* Hash bucket for ifindex lookups. */
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

/* Serialize access to the per-CPU input queue; a no-op without RPS. */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	/* Publish on the main list and both lookup hashes under one
	 * write_lock_bh section so readers never see a partial insert.
	 */
	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * NOTE: netdev_lock_type[] and netdev_lock_name[] are parallel arrays;
 * entries at the same index must stay paired.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

/* Linear search for dev_type's slot; falls back to the last (VOID/NONE) key. */
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
/* Without lockdep these are no-ops. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!!
 *	Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

/*
 * Pick the list a packet_type belongs on: ETH_P_ALL handlers go on a
 * taps list (per-device if pt->dev is set), everything else is hashed
 * by protocol number (again per-device if pt->dev is set).
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	/* Compare by pointer identity: only the exact registration counts. */
	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* Wait out the RCU grace period so no reader still holds pt. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	/* Keep offload_base sorted by ascending priority: insert before
	 * the first entry whose priority exceeds ours.
	 */
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	/* Pointer-identity match, mirroring __dev_remove_pack(). */
	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	/* Wait out the RCU grace period so no reader still holds po. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	/* A slot is free when its name is empty or starts with a space. */
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		/* Only occupied slots (non-empty, non-space name) can match. */
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	/* ints[0] holds how many integers were parsed from "netdev=". */
	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of a interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. Following API allows
 *	user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	/* Unclone first: the driver callback may write into the tunnel info. */
	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);	/* take a reference before leaving RCU */
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);	/* take a reference before leaving RCU */
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	/* Sample the rename seqcount; if a rename races with the copy
	 * below, read_seqcount_retry() sends us back here.
	 */
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	/* Full list walk: hardware addresses are not hashed. */
	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

/* First device of the given hardware type; no refcount taken, needs RTNL. */
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

/* First device of the given hardware type; returns it with a reference held. */
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		/* XOR then mask: all bits selected by @mask must match. */
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	/* "." and ".." would collide with directory entries in sysfs. */
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf: scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* Mark every unit number already taken by an existing
		 * device whose name matches the format string.
		 */
		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* First clear bit is the lowest free unit number */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Render the chosen (or literal) name into @buf; with no '%' in
	 * @name this is effectively a bounded copy with i == 0 unused.
	 */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
1085 * Returns the number of the unit assigned or a negative errno code. 1086 */ 1087 1088 int dev_alloc_name(struct net_device *dev, const char *name) 1089 { 1090 char buf[IFNAMSIZ]; 1091 struct net *net; 1092 int ret; 1093 1094 BUG_ON(!dev_net(dev)); 1095 net = dev_net(dev); 1096 ret = __dev_alloc_name(net, name, buf); 1097 if (ret >= 0) 1098 strlcpy(dev->name, buf, IFNAMSIZ); 1099 return ret; 1100 } 1101 EXPORT_SYMBOL(dev_alloc_name); 1102 1103 static int dev_alloc_name_ns(struct net *net, 1104 struct net_device *dev, 1105 const char *name) 1106 { 1107 char buf[IFNAMSIZ]; 1108 int ret; 1109 1110 ret = __dev_alloc_name(net, name, buf); 1111 if (ret >= 0) 1112 strlcpy(dev->name, buf, IFNAMSIZ); 1113 return ret; 1114 } 1115 1116 static int dev_get_valid_name(struct net *net, 1117 struct net_device *dev, 1118 const char *name) 1119 { 1120 BUG_ON(!net); 1121 1122 if (!dev_valid_name(name)) 1123 return -EINVAL; 1124 1125 if (strchr(name, '%')) 1126 return dev_alloc_name_ns(net, dev, name); 1127 else if (__dev_get_by_name(net, name)) 1128 return -EEXIST; 1129 else if (dev->name != name) 1130 strlcpy(dev->name, name, IFNAMSIZ); 1131 1132 return 0; 1133 } 1134 1135 /** 1136 * dev_change_name - change name of a device 1137 * @dev: device 1138 * @newname: name (or format string) must be at least IFNAMSIZ 1139 * 1140 * Change name of a device, can pass format strings "eth%d". 1141 * for wildcarding. 
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/* Writers of dev->name publish the change through this seqcount so
	 * lockless readers can detect a rename in progress.
	 */
	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Keep the old name around so we can roll back on failure */
	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/* Re-hash the device under its new name: unlink, wait a grace
	 * period so no RCU reader still sees the old hash position, then
	 * re-insert.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First notifier failure: swap old and new names and
			 * retry the rename once to restore the old name.
			 */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* Rollback itself failed; give up and report */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	/* Zero length clears the alias entirely */
	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1276 */ 1277 void netdev_state_change(struct net_device *dev) 1278 { 1279 if (dev->flags & IFF_UP) { 1280 struct netdev_notifier_change_info change_info; 1281 1282 change_info.flags_changed = 0; 1283 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 1284 &change_info.info); 1285 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); 1286 } 1287 } 1288 EXPORT_SYMBOL(netdev_state_change); 1289 1290 /** 1291 * netdev_notify_peers - notify network peers about existence of @dev 1292 * @dev: network device 1293 * 1294 * Generate traffic such that interested network peers are aware of 1295 * @dev, such as by generating a gratuitous ARP. This may be used when 1296 * a device wants to inform the rest of the network about some sort of 1297 * reconfiguration such as a failover event or virtual machine 1298 * migration. 1299 */ 1300 void netdev_notify_peers(struct net_device *dev) 1301 { 1302 rtnl_lock(); 1303 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1304 rtnl_unlock(); 1305 } 1306 EXPORT_SYMBOL(netdev_notify_peers); 1307 1308 static int __dev_open(struct net_device *dev) 1309 { 1310 const struct net_device_ops *ops = dev->netdev_ops; 1311 int ret; 1312 1313 ASSERT_RTNL(); 1314 1315 if (!netif_device_present(dev)) 1316 return -ENODEV; 1317 1318 /* Block netpoll from trying to do any rx path servicing. 
1319 * If we don't do this there is a chance ndo_poll_controller 1320 * or ndo_poll may be running while we open the device 1321 */ 1322 netpoll_poll_disable(dev); 1323 1324 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1325 ret = notifier_to_errno(ret); 1326 if (ret) 1327 return ret; 1328 1329 set_bit(__LINK_STATE_START, &dev->state); 1330 1331 if (ops->ndo_validate_addr) 1332 ret = ops->ndo_validate_addr(dev); 1333 1334 if (!ret && ops->ndo_open) 1335 ret = ops->ndo_open(dev); 1336 1337 netpoll_poll_enable(dev); 1338 1339 if (ret) 1340 clear_bit(__LINK_STATE_START, &dev->state); 1341 else { 1342 dev->flags |= IFF_UP; 1343 dev_set_rx_mode(dev); 1344 dev_activate(dev); 1345 add_device_randomness(dev->dev_addr, dev->addr_len); 1346 } 1347 1348 return ret; 1349 } 1350 1351 /** 1352 * dev_open - prepare an interface for use. 1353 * @dev: device to open 1354 * 1355 * Takes a device from down to up state. The device's private open 1356 * function is invoked and then the multicast lists are loaded. Finally 1357 * the device is moved into the up state and a %NETDEV_UP message is 1358 * sent to the netdev notifier chain. 1359 * 1360 * Calling this function on an active interface is a nop. On a failure 1361 * a negative errno code is returned. 
/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/* Already up: nothing to do */
	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/* Announce the state change only after a successful open */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

/* First half of the close sequence for every device on @head: announce
 * GOING_DOWN, clear __LINK_STATE_START, deactivate the qdiscs, then call
 * each driver's ndo_stop.  Caller holds the RTNL; may sleep.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

/* Close a single device by wrapping it in a one-element close list */
static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

/* Close every device on @head that is currently up.  When @unlink is
 * true each device is also removed from the list afterwards.  Always
 * returns 0.
 */
int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);
1491 */ 1492 void dev_disable_lro(struct net_device *dev) 1493 { 1494 struct net_device *lower_dev; 1495 struct list_head *iter; 1496 1497 dev->wanted_features &= ~NETIF_F_LRO; 1498 netdev_update_features(dev); 1499 1500 if (unlikely(dev->features & NETIF_F_LRO)) 1501 netdev_WARN(dev, "failed to disable LRO!\n"); 1502 1503 netdev_for_each_lower_dev(dev, lower_dev, iter) 1504 dev_disable_lro(lower_dev); 1505 } 1506 EXPORT_SYMBOL(dev_disable_lro); 1507 1508 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, 1509 struct net_device *dev) 1510 { 1511 struct netdev_notifier_info info; 1512 1513 netdev_notifier_info_init(&info, dev); 1514 return nb->notifier_call(nb, val, &info); 1515 } 1516 1517 static int dev_boot_phase = 1; 1518 1519 /** 1520 * register_netdevice_notifier - register a network notifier block 1521 * @nb: notifier 1522 * 1523 * Register a notifier to be called when network device events occur. 1524 * The notifier passed is linked into the kernel structures and must 1525 * not be reused until it has been unregistered. A negative errno code 1526 * is returned on a failure. 1527 * 1528 * When registered all registration and up events are replayed 1529 * to the new notifier to allow device to have a race free 1530 * view of the network device list. 
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* During early boot there are no devices to replay yet */
	if (dev_boot_phase)
		goto unlock;
	/* Replay REGISTER (and UP for running devices) for every existing
	 * device in every namespace so the notifier sees a complete view.
	 */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/* Undo the replay: synthesize DOWN/UNREGISTER for every device we
	 * already notified, stopping at the one that failed.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked into the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	/* Mirror the replay done at registration time: tell the departing
	 * notifier that every device went down and was unregistered.
	 */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
/* Counts users of the ingress path; keeps the ingress hook compiled out
 * of the RX fast path (via static branch) while it is zero.
 */
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

/* Static branch gating packet timestamping in the hot path */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		/* Cancel this increment against one deferred decrement and
		 * apply the remaining (deferred - 1) decrements now, from
		 * process context where static_key_slow_dec() is allowed.
		 */
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	/* See above: the jump-label update may sleep, so defer from irq */
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

/* Stamp @skb with the current time iff any listener asked for it */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

/* Conditionally stamp SKB: only when timestamping is enabled, COND
 * holds, and the skb has not been stamped already.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\

1728 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) 1729 { 1730 unsigned int len; 1731 1732 if (!(dev->flags & IFF_UP)) 1733 return false; 1734 1735 len = dev->mtu + dev->hard_header_len + VLAN_HLEN; 1736 if (skb->len <= len) 1737 return true; 1738 1739 /* if TSO is enabled, we don't care about the length as the packet 1740 * could be forwarded without being segmented before 1741 */ 1742 if (skb_is_gso(skb)) 1743 return true; 1744 1745 return false; 1746 } 1747 EXPORT_SYMBOL_GPL(is_skb_forwardable); 1748 1749 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1750 { 1751 if (skb_orphan_frags(skb, GFP_ATOMIC) || 1752 unlikely(!is_skb_forwardable(dev, skb))) { 1753 atomic_long_inc(&dev->rx_dropped); 1754 kfree_skb(skb); 1755 return NET_RX_DROP; 1756 } 1757 1758 skb_scrub_packet(skb, true); 1759 skb->priority = 0; 1760 skb->protocol = eth_type_trans(skb, dev); 1761 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1762 1763 return 0; 1764 } 1765 EXPORT_SYMBOL_GPL(__dev_forward_skb); 1766 1767 /** 1768 * dev_forward_skb - loopback an skb to another netif 1769 * 1770 * @dev: destination network device 1771 * @skb: buffer to forward 1772 * 1773 * return values: 1774 * NET_RX_SUCCESS (no congestion) 1775 * NET_RX_DROP (packet was dropped, but freed) 1776 * 1777 * dev_forward_skb can be used for injecting an skb from the 1778 * start_xmit function of one device into the receive queue 1779 * of another device. 1780 * 1781 * The receiving device may be in another namespace, so 1782 * we have to clear all information in the skb that could 1783 * impact namespace isolation. 
1784 */ 1785 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1786 { 1787 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); 1788 } 1789 EXPORT_SYMBOL_GPL(dev_forward_skb); 1790 1791 static inline int deliver_skb(struct sk_buff *skb, 1792 struct packet_type *pt_prev, 1793 struct net_device *orig_dev) 1794 { 1795 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 1796 return -ENOMEM; 1797 atomic_inc(&skb->users); 1798 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 1799 } 1800 1801 static inline void deliver_ptype_list_skb(struct sk_buff *skb, 1802 struct packet_type **pt, 1803 struct net_device *orig_dev, 1804 __be16 type, 1805 struct list_head *ptype_list) 1806 { 1807 struct packet_type *ptype, *pt_prev = *pt; 1808 1809 list_for_each_entry_rcu(ptype, ptype_list, list) { 1810 if (ptype->type != type) 1811 continue; 1812 if (pt_prev) 1813 deliver_skb(skb, pt_prev, orig_dev); 1814 pt_prev = ptype; 1815 } 1816 *pt = pt_prev; 1817 } 1818 1819 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) 1820 { 1821 if (!ptype->af_packet_priv || !skb->sk) 1822 return false; 1823 1824 if (ptype->id_match) 1825 return ptype->id_match(ptype, skb->sk); 1826 else if ((struct sock *)ptype->af_packet_priv == skb->sk) 1827 return true; 1828 1829 return false; 1830 } 1831 1832 /* 1833 * Support routine. Sends outgoing frames to any network 1834 * taps currently in use. 
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	/* First pass walks the global ptype_all list, second pass (via the
	 * "again" label) walks the per-device one.
	 */
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* Deliver the clone to the previous tap and remember the
		 * current one, so the final tap gets the clone without an
		 * extra reference bump.
		 */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* Hand the clone's last reference to the final tap */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. If is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

#ifdef CONFIG_XPS
/* Serializes all writers of dev->xps_maps */
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

/* Remove tx queue @index from @cpu's XPS map.  Frees the map (and clears
 * the RCU pointer) when the queue was its last entry.  Returns the map
 * still in place, or NULL if it was removed or absent.
 */
static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				/* swap-with-last removal; order not kept */
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}

/* Drop all XPS entries for tx queues >= @index on every CPU; when no
 * entries remain at all the whole dev_maps structure is freed.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		/* all removals hit populated maps: CPU still has entries */
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	/* Forget NUMA affinity recorded for the removed queues */
	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}

/* Return a map that can hold queue @index for @cpu: the existing @map if
 * it already contains the queue or has spare room, otherwise a freshly
 * allocated (doubled) copy.  Returns NULL on allocation failure.
 */
static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}

/* Install the XPS CPU mask @mask for tx queue @index on @dev.  Builds a
 * new dev_maps (copy-on-write), publishes it with rcu_assign_pointer(),
 * then frees the superseded maps after a grace period.  Returns 0 or
 * -ENOMEM.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	/* mask selected no online CPU: nothing to install */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			/* track a common NUMA node: -2 = unset, -1 = mixed */
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	int rc;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		/* Create/destroy the per-queue sysfs objects first; bail
		 * out before touching any state if that fails.
		 */
		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		/* When shrinking, flush stale skbs and XPS entries for the
		 * queues that are going away.
		 */
		if (txq < dev->real_num_tx_queues) {
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	}

	dev->real_num_tx_queues = txq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
 */
int netif_get_num_default_rss_queues(void)
{
	/* Cap at the compile-time default; fewer online CPUs means fewer
	 * queues are useful.
	 */
	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

/* Append @q to this CPU's softnet output queue and kick the TX softirq.
 * Caller must have claimed __QDISC_STATE_SCHED (see __netif_schedule).
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	/* IRQs off: the per-cpu output queue is also touched from the
	 * TX softirq.
	 */
	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	/* The SCHED bit makes scheduling idempotent: only the winner of
	 * test_and_set_bit actually queues the qdisc.
	 */
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

/* Overlay stored in skb->cb while an skb sits on the completion queue. */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}

/* Schedule the qdisc attached to @txq unless the queue is stopped/frozen. */
void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/**
 * netif_wake_subqueue - allow sending packets on subqueue
 * @dev: network device
 * @queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

	/* Only the task that clears DRV_XOFF reschedules the qdisc,
	 * so concurrent wakers cannot double-schedule.
	 */
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(txq->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_wake_subqueue);

/* Same as netif_wake_subqueue() but takes the queue directly. */
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);

/* Queue @skb for freeing from hardirq (or irqs-disabled) context: the
 * actual free happens later in the NET_TX softirq via the per-cpu
 * completion queue.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		/* Pair ordering with the refcount read above before we
		 * take sole ownership of the skb.
		 */
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* Someone else still holds a reference; not ours to free. */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

/* Free @skb from any context: defer to the softirq path when called
 * from hardirq or with interrupts disabled, free directly otherwise.
 */
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
	if (in_irq() || irqs_disabled())
		__dev_kfree_skb_irq(skb, reason);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	/* Only act on the transition PRESENT -> absent, and only if the
	 * interface is up; queues of a down device are already stopped.
	 */
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	/* Mirror of netif_device_detach(): act only on absent -> PRESENT. */
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 * to be used as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		/* Fold the recorded RX queue into range without a modulo. */
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	/* With traffic classes, hash only within the class's queue range. */
	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);

/* Rate-limited diagnostic for packets whose offload state is inconsistent
 * with what the device/socket advertise.
 */
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features = 0;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	/* GSO packets must be segmented before their checksum can be
	 * resolved in software.
	 */
	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Writing the checksum into a cloned head needs a private copy. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

/* skb_csum_offload_check - Driver helper function to determine if a device
 * with limited checksum offload capabilities is able to offload the checksum
 * for a given packet.
 *
 * Arguments:
 *   skb - sk_buff for the packet in question
 *   spec - contains the description of what device can offload
 *   csum_encapped - returns true if the checksum being offloaded is
 *	      encpasulated.
 That is it is checksum for the transport header
 *	      in the inner headers.
 *   checksum_help - when set indicates that helper function should
 *	      call skb_checksum_help if offload checks fail
 *
 * Returns:
 *   true: Packet has passed the checksum checks and should be offloadable to
 *	   the device (a driver may still need to check for additional
 *	   restrictions of its device)
 *   false: Checksum is not offloadable. If checksum_help was set then
 *	   skb_checksum_help was called to resolve checksum for non-GSO
 *	   packets and when IP protocol is not SCTP
 */
bool __skb_csum_offload_chk(struct sk_buff *skb,
			    const struct skb_csum_offl_spec *spec,
			    bool *csum_encapped,
			    bool csum_help)
{
	struct iphdr *iph;
	struct ipv6hdr *ipv6;
	void *nhdr;
	int protocol;
	u8 ip_proto;

	if (skb->protocol == htons(ETH_P_8021Q) ||
	    skb->protocol == htons(ETH_P_8021AD)) {
		if (!spec->vlan_okay)
			goto need_help;
	}

	/* We check whether the checksum refers to a transport layer checksum in
	 * the outermost header or an encapsulated transport layer checksum that
	 * corresponds to the inner headers of the skb. If the checksum is for
	 * something else in the packet we need help.
	 */
	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
		/* Non-encapsulated checksum */
		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
		nhdr = skb_network_header(skb);
		*csum_encapped = false;
		if (spec->no_not_encapped)
			goto need_help;
	} else if (skb->encapsulation && spec->encap_okay &&
		   skb_checksum_start_offset(skb) ==
		   skb_inner_transport_offset(skb)) {
		/* Encapsulated checksum */
		*csum_encapped = true;
		switch (skb->inner_protocol_type) {
		case ENCAP_TYPE_ETHER:
			protocol = eproto_to_ipproto(skb->inner_protocol);
			break;
		case ENCAP_TYPE_IPPROTO:
			protocol = skb->inner_protocol;
			break;
		}
		nhdr = skb_inner_network_header(skb);
	} else {
		goto need_help;
	}

	/* First level dispatch on the network protocol. */
	switch (protocol) {
	case IPPROTO_IP:
		if (!spec->ipv4_okay)
			goto need_help;
		iph = nhdr;
		ip_proto = iph->protocol;
		if (iph->ihl != 5 && !spec->ip_options_okay)
			goto need_help;
		break;
	case IPPROTO_IPV6:
		if (!spec->ipv6_okay)
			goto need_help;
		if (spec->no_encapped_ipv6 && *csum_encapped)
			goto need_help;
		ipv6 = nhdr;
		nhdr += sizeof(*ipv6);
		ip_proto = ipv6->nexthdr;
		break;
	default:
		goto need_help;
	}

ip_proto_again:
	/* Second level: walk to the transport protocol, looping over IPv6
	 * extension headers.
	 */
	switch (ip_proto) {
	case IPPROTO_TCP:
		if (!spec->tcp_okay ||
		    skb->csum_offset != offsetof(struct tcphdr, check))
			goto need_help;
		break;
	case IPPROTO_UDP:
		if (!spec->udp_okay ||
		    skb->csum_offset != offsetof(struct udphdr, check))
			goto need_help;
		break;
	case IPPROTO_SCTP:
		/* SCTP uses CRC32c; skb_checksum_help cannot resolve it,
		 * hence cant_help rather than need_help.
		 */
		if (!spec->sctp_okay ||
		    skb->csum_offset != offsetof(struct sctphdr, checksum))
			goto cant_help;
		break;
	case NEXTHDR_HOP:
	case NEXTHDR_ROUTING:
	case NEXTHDR_DEST: {
		u8 *opthdr = nhdr;

		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
			goto need_help;

		ip_proto = opthdr[0];
		nhdr += (opthdr[1] + 1) << 3;

		goto ip_proto_again;
	}
	default:
		goto need_help;
	}

	/* Passed the tests for offloading checksum */
	return true;

need_help:
	if (csum_help && !skb_shinfo(skb)->gso_size)
		skb_checksum_help(skb);
cant_help:
	return false;
}
EXPORT_SYMBOL(__skb_csum_offload_chk);

/* Return the network-layer protocol of @skb, looking through an outer
 * Ethernet header (ETH_P_TEB) and VLAN tags; *depth is updated to the
 * header depth consumed.
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb_mac_header(skb);
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}

/**
 * skb_mac_gso_segment - mac layer segmentation handler.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	/* Dispatch to the protocol's registered gso_segment callback. */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore the original skb->data before returning. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);


/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	/* TX expects CHECKSUM_PARTIAL; RX only distrusts CHECKSUM_NONE. */
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL;
	else
		return skb->ip_summed == CHECKSUM_NONE;
}

/**
 * __skb_gso_segment - Perform segmentation on skb.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 * @tx_path: whether it is called in TX path
 *
 * This function segments the given skb and returns a list of segments.
 *
 * It may return NULL if the skb requires no segmentation.  This is
 * only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		skb_warn_bad_offload(skb);

		/* Header may be rewritten below; make it private. */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	return skb_mac_gso_segment(skb, features);
}
EXPORT_SYMBOL(__skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	/* Device cannot DMA from highmem: any highmem frag disqualifies SG. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	/* On phys-addressed DMA buses, also honor the device's dma_mask. */
	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

/* Strip features the device cannot apply to this particular skb:
 * checksum offload for unsupported protocols, SG for un-DMA-able frags.
 */
static netdev_features_t harmonize_features(struct sk_buff *skb,
	netdev_features_t features)
{
	int tmp;
	__be16 type;

	type = skb_network_protocol(skb, &tmp);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~NETIF_F_CSUM_MASK;
	} else if (illegal_highdma(skb->dev, skb)) {
		features &= ~NETIF_F_SG;
	}

	return features;
}

/* ndo_features_check implementation that imposes no restrictions. */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);

/* Default feature check used when the driver supplies no
 * ndo_features_check: only VLAN restrictions apply.
 */
static netdev_features_t dflt_features_check(const struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}

netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	/* Out-of-range segment count: fall back to software GSO. */
	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
		features &= ~NETIF_F_GSO_MASK;

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

/* Hand one skb to the driver, delivering taps first and emitting
 * tracepoints around the actual transmit.
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
		dev_queue_xmit_nit(skb, dev);

	/* Read len before xmit: the driver may free or modify the skb. */
	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

/* Transmit a list of skbs; on failure or a stopped queue, return the
 * untransmitted remainder (NULL when everything was sent) with *ret
 * holding the last driver return code.
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			/* Re-link so the caller gets the full remainder. */
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}

/* Push the VLAN tag into the payload when the device cannot offload
 * tag insertion for this protocol.  May return NULL on allocation
 * failure (skb consumed).
 */
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto))
		skb = __vlan_hwaccel_push_inside(skb);
	return skb;
}

/* Make one skb acceptable to the device: VLAN insertion, software GSO,
 * linearization and checksum resolution as dictated by the device
 * features.  Returns the (possibly segmented) skb chain, or NULL if it
 * had to be dropped.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	/* Already part of a validated chain. */
	if (skb->next)
		return skb;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_CSUM_MASK) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}

/* Run validate_xmit_skb() over a whole list, splicing the surviving
 * (possibly expanded) chains back together.
 */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb wont be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}

static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
			hdr_len += tcp_hdrlen(skb);
		else
			hdr_len += sizeof(struct udphdr);

		/* Don't trust gso_segs from untrusted (DODGY) sources. */
		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}

static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before the (long) dequeue loop. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
/* Fill in skb->priority from the sending task's net_prio cgroup map,
 * unless a priority was already set.  Caller holds rcu_read_lock_bh.
 */
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx =
			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);

		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
#define skb_update_prio(skb)
#endif

/* Per-cpu depth counter guarding against xmit recursion through
 * stacked/virtual devices.
 */
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

#define RECURSION_LIMIT 10

/**
 * dev_loopback_xmit - loop back @skb
 * @net: network namespace this loopback is happening in
 * @sk: sk needed to be a netfilter okfn
 * @skb: buffer to transmit
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

/* Look up a TX queue from the device's XPS map for the sending CPU;
 * returns -1 when XPS is unconfigured or yields an out-of-range queue.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		/* sender_cpu is stored off-by-one (0 means unset). */
		map = rcu_dereference(
		    dev_maps->cpu_map[skb->sender_cpu - 1]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	/* Recompute when the cached socket queue is missing, stale, or
	 * out-of-order delivery is acceptable.
	 */
	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);
		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		/* Cache the choice on full sockets with a cached route. */
		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv)
{
	int queue_index = 0;

#ifdef CONFIG_XPS
	/* Record the sending CPU (off-by-one encoding) if unset. */
	u32 sender_cpu = skb->sender_cpu - 1;

	if (sender_cpu >= (u32)NR_CPUS)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;
		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
							    __netdev_pick_tx);
		else
			queue_index = __netdev_pick_tx(dev, skb);

		if (!accel_priv)
			queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

#ifdef CONFIG_NET_SWITCHDEV
	/* Don't forward if offload device already forwarded */
	if (skb->offload_fwd_mark &&
	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
		consume_skb(skb);
		rc = NET_XMIT_SUCCESS;
		goto out;
	}
#endif

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* Normal path: the queue has a qdisc attached. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto drop;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
drop:
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);

int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);


/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

struct static_key rps_needed __read_mostly;

/* Record @next_cpu as the target of @rflow, and (with RFS acceleration)
 * try to re-steer the hardware flow to a queue nearer that CPU.  Returns
 * the flow entry to use from now on, which may differ from @rflow when a
 * new hardware filter was installed.  Caller holds rcu_read_lock.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* Invalidate the old entry's filter unless it aliases ours. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;	/* -1: no steering, caller keeps the current CPU */
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match.
		 * An entry packs the flow hash in the upper bits and the
		 * desired CPU in the low rps_cpu_mask bits.
		 */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* No (usable) RFS entry: fall back to the plain RPS cpu map. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	unsigned int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		rflow = &flow_table->flows[flow_id];
		cpu = ACCESS_ONCE(rflow->cpu);
		/* Keep the filter while the entry still matches and the
		 * target CPU has not advanced too far past the last packet
		 * enqueued through it (i.e. the flow is still active).
		 */
		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
			   rflow->last_qtail) <
		     (int)(10 * flow_table->mask)))
			expire = false;
	}
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}

#endif /* CONFIG_RPS */

/*
 * Check if this softnet_data structure is another cpu one
 * If yes, queue it to our IPI list and return 1
 * If no, return 0
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd != mysd) {
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
	return 0;
}

#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

/* Return true if @skb should be dropped by the per-CPU flow limiter.
 * Only engages once the backlog is at least half of netdev_max_backlog;
 * then a short history of recent flow hashes is kept and a flow is
 * penalized once it accounts for more than half of FLOW_LIMIT_HISTORY.
 */
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

static
int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		/* Steer to the RPS-selected CPU, or stay local if none. */
		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);

/* Process-context variant of netif_rx(): since there is no interrupt
 * return path here to run softirqs, run any that were raised before
 * re-enabling preemption.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);

/* NET_TX softirq: free skbs queued on the completion list and run
 * qdiscs that were scheduled for transmission.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the completion list with irqs off, then free
		 * the skbs with irqs back on.
		 */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_atomic();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Lock contended: reschedule unless the
				 * qdisc is being deactivated.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_atomic();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}

#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
	(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/* Run the ingress classifier chain on @skb.  Delivers any pending
 * *pt_prev first, then acts on the classification result: drop, steal,
 * redirect, or pass the skb on (possibly with a new tc_index).
 * Returns the skb to continue processing, or NULL if it was consumed.
 */
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		/* fall through: SHOT also frees the skb */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}

/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}

/* Run the netfilter ingress hook, delivering any pending *pt_prev
 * first.  Returns the hook verdict (<0 means the skb was consumed).
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (nf_hook_ingress_active(skb)) {
		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		return nf_hook_ingress(skb);
	}
#endif /* CONFIG_NETFILTER_INGRESS */
	return 0;
}

/* Core receive path: taps (ptype_all), ingress classification and
 * netfilter ingress, VLAN handling, the device rx_handler, and finally
 * delivery to the matching protocol handlers.  @pfmemalloc restricts
 * delivery to protocols that can handle PFMEMALLOC skbs.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}

static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}

static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS
	/* With RPS enabled, hand off to the chosen remote CPU's backlog. */
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
			return ret;
		}
	}
#endif
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
}

/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);

/* Network device is going away, flush any packets still pending
 * Called with irqs disabled.
 */
static void flush_backlog(void *arg)
{
	struct net_device *dev = arg;
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	struct sk_buff *skb, *tmp;

	/* Drop any skbs for @dev from this CPU's input and process queues. */
	rps_lock(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock(sd);

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
}

/* Finish a GRO'd skb: run the protocol's gro_complete callback (unless
 * it is a single, unmerged segment) and hand it up the receive path.
 */
static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb, 0);
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	return netif_receive_skb_internal(skb);
}

/* napi->gro_list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
		skb->prev = prev;
		prev = skb;
	}

	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		/* When flushing only old entries, stop at the first skb
		 * that was added this jiffy (younger ones follow it).
		 */
		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;

		prev = skb->prev;
		napi_gro_complete(skb);
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);

/* Mark each held GRO skb as same_flow (candidate for merging with @skb)
 * or not, by comparing hash, device, vlan tag, and MAC header.
 */
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}

/* Reset per-skb GRO state; if all data lives in frag 0 (empty linear
 * part) and is not in highmem, allow direct access to it via frag0.
 */
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
	}
}

static void gro_pull_from_frag0(struct
sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	/* Copy @grow bytes from frag 0 into the linear area and shrink
	 * (or remove) the fragment accordingly.
	 */
	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}

/* GRO engine: try to merge @skb into a held flow on napi->gro_list via
 * the protocol's gro_receive callback; otherwise hold it (GRO_HELD) or
 * fall back to normal delivery (GRO_NORMAL).
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* No offload handler matched this protocol. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* The callback asked us to flush a held flow: complete it now. */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}

struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);

struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type
!= type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);

/* Dispose of @skb according to the GRO verdict from dev_gro_receive(). */
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		/* Data was merged into a held skb; free only the head. */
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			kmem_cache_free(skbuff_head_cache, skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
		break;
	}

	return ret;
}

gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	skb_mark_napi_id(skb, napi);
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);

/* Recycle @skb for reuse by napi_get_frags(): reset its metadata and
 * stash it on napi->skb (PFMEMALLOC skbs are freed instead).
 */
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	skb->vlan_tci = 0;
	skb->dev = napi->dev;
	skb->skb_iif = 0;
	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	napi->skb = skb;
}

struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
		if (skb) {
			napi->skb = skb;
			skb_mark_napi_id(skb, napi);
		}
	}
	return skb;
}
EXPORT_SYMBOL(napi_get_frags);

/* Dispose of a frags-mode skb according to the GRO verdict; restores
 * the ethernet header before normal delivery.
 */
static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
	case GRO_MERGED_FREE:
		napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
		break;
	}

	return ret;
}

/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		/* Header not fully in frag0: take the slow path. */
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}

gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi_frags_skb(napi);

	if (!skb)
		return GRO_DROP;

	trace_napi_gro_frags_entry(skb);

	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);

/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);

/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus.
*/ 4656 while (remsd) { 4657 struct softnet_data *next = remsd->rps_ipi_next; 4658 4659 if (cpu_online(remsd->cpu)) 4660 smp_call_function_single_async(remsd->cpu, 4661 &remsd->csd); 4662 remsd = next; 4663 } 4664 } else 4665 #endif 4666 local_irq_enable(); 4667 } 4668 4669 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4670 { 4671 #ifdef CONFIG_RPS 4672 return sd->rps_ipi_list != NULL; 4673 #else 4674 return false; 4675 #endif 4676 } 4677 4678 static int process_backlog(struct napi_struct *napi, int quota) 4679 { 4680 int work = 0; 4681 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4682 4683 /* Check if we have pending ipi, its better to send them now, 4684 * not waiting net_rx_action() end. 4685 */ 4686 if (sd_has_rps_ipi_waiting(sd)) { 4687 local_irq_disable(); 4688 net_rps_action_and_irq_enable(sd); 4689 } 4690 4691 napi->weight = weight_p; 4692 local_irq_disable(); 4693 while (1) { 4694 struct sk_buff *skb; 4695 4696 while ((skb = __skb_dequeue(&sd->process_queue))) { 4697 rcu_read_lock(); 4698 local_irq_enable(); 4699 __netif_receive_skb(skb); 4700 rcu_read_unlock(); 4701 local_irq_disable(); 4702 input_queue_head_incr(sd); 4703 if (++work >= quota) { 4704 local_irq_enable(); 4705 return work; 4706 } 4707 } 4708 4709 rps_lock(sd); 4710 if (skb_queue_empty(&sd->input_pkt_queue)) { 4711 /* 4712 * Inline a custom version of __napi_complete(). 4713 * only current cpu owns and manipulates this napi, 4714 * and NAPI_STATE_SCHED is the only possible flag set 4715 * on backlog. 4716 * We can use a plain write instead of clear_bit(), 4717 * and we dont need an smp_mb() memory barrier. 
4718 */ 4719 napi->state = 0; 4720 rps_unlock(sd); 4721 4722 break; 4723 } 4724 4725 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4726 &sd->process_queue); 4727 rps_unlock(sd); 4728 } 4729 local_irq_enable(); 4730 4731 return work; 4732 } 4733 4734 /** 4735 * __napi_schedule - schedule for receive 4736 * @n: entry to schedule 4737 * 4738 * The entry's receive function will be scheduled to run. 4739 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 4740 */ 4741 void __napi_schedule(struct napi_struct *n) 4742 { 4743 unsigned long flags; 4744 4745 local_irq_save(flags); 4746 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4747 local_irq_restore(flags); 4748 } 4749 EXPORT_SYMBOL(__napi_schedule); 4750 4751 /** 4752 * __napi_schedule_irqoff - schedule for receive 4753 * @n: entry to schedule 4754 * 4755 * Variant of __napi_schedule() assuming hard irqs are masked 4756 */ 4757 void __napi_schedule_irqoff(struct napi_struct *n) 4758 { 4759 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4760 } 4761 EXPORT_SYMBOL(__napi_schedule_irqoff); 4762 4763 void __napi_complete(struct napi_struct *n) 4764 { 4765 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4766 4767 list_del_init(&n->poll_list); 4768 smp_mb__before_atomic(); 4769 clear_bit(NAPI_STATE_SCHED, &n->state); 4770 } 4771 EXPORT_SYMBOL(__napi_complete); 4772 4773 void napi_complete_done(struct napi_struct *n, int work_done) 4774 { 4775 unsigned long flags; 4776 4777 /* 4778 * don't let napi dequeue from the cpu poll list 4779 * just in case its running on a different cpu 4780 */ 4781 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4782 return; 4783 4784 if (n->gro_list) { 4785 unsigned long timeout = 0; 4786 4787 if (work_done) 4788 timeout = n->dev->gro_flush_timeout; 4789 4790 if (timeout) 4791 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4792 HRTIMER_MODE_REL_PINNED); 4793 else 4794 napi_gro_flush(n, false); 4795 } 4796 if (likely(list_empty(&n->poll_list))) { 4797 
WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4798 } else { 4799 /* If n->poll_list is not empty, we need to mask irqs */ 4800 local_irq_save(flags); 4801 __napi_complete(n); 4802 local_irq_restore(flags); 4803 } 4804 } 4805 EXPORT_SYMBOL(napi_complete_done); 4806 4807 /* must be called under rcu_read_lock(), as we dont take a reference */ 4808 static struct napi_struct *napi_by_id(unsigned int napi_id) 4809 { 4810 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4811 struct napi_struct *napi; 4812 4813 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4814 if (napi->napi_id == napi_id) 4815 return napi; 4816 4817 return NULL; 4818 } 4819 4820 #if defined(CONFIG_NET_RX_BUSY_POLL) 4821 #define BUSY_POLL_BUDGET 8 4822 bool sk_busy_loop(struct sock *sk, int nonblock) 4823 { 4824 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; 4825 int (*busy_poll)(struct napi_struct *dev); 4826 struct napi_struct *napi; 4827 int rc = false; 4828 4829 rcu_read_lock(); 4830 4831 napi = napi_by_id(sk->sk_napi_id); 4832 if (!napi) 4833 goto out; 4834 4835 /* Note: ndo_busy_poll method is optional in linux-4.5 */ 4836 busy_poll = napi->dev->netdev_ops->ndo_busy_poll; 4837 4838 do { 4839 rc = 0; 4840 local_bh_disable(); 4841 if (busy_poll) { 4842 rc = busy_poll(napi); 4843 } else if (napi_schedule_prep(napi)) { 4844 void *have = netpoll_poll_lock(napi); 4845 4846 if (test_bit(NAPI_STATE_SCHED, &napi->state)) { 4847 rc = napi->poll(napi, BUSY_POLL_BUDGET); 4848 trace_napi_poll(napi); 4849 if (rc == BUSY_POLL_BUDGET) { 4850 napi_complete_done(napi, rc); 4851 napi_schedule(napi); 4852 } 4853 } 4854 netpoll_poll_unlock(have); 4855 } 4856 if (rc > 0) 4857 NET_ADD_STATS_BH(sock_net(sk), 4858 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 4859 local_bh_enable(); 4860 4861 if (rc == LL_FLUSH_FAILED) 4862 break; /* permanent failure */ 4863 4864 cpu_relax(); 4865 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && 4866 !need_resched() && 
!busy_loop_timeout(end_time)); 4867 4868 rc = !skb_queue_empty(&sk->sk_receive_queue); 4869 out: 4870 rcu_read_unlock(); 4871 return rc; 4872 } 4873 EXPORT_SYMBOL(sk_busy_loop); 4874 4875 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4876 4877 void napi_hash_add(struct napi_struct *napi) 4878 { 4879 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 4880 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 4881 return; 4882 4883 spin_lock(&napi_hash_lock); 4884 4885 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ 4886 do { 4887 if (unlikely(++napi_gen_id < NR_CPUS + 1)) 4888 napi_gen_id = NR_CPUS + 1; 4889 } while (napi_by_id(napi_gen_id)); 4890 napi->napi_id = napi_gen_id; 4891 4892 hlist_add_head_rcu(&napi->napi_hash_node, 4893 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 4894 4895 spin_unlock(&napi_hash_lock); 4896 } 4897 EXPORT_SYMBOL_GPL(napi_hash_add); 4898 4899 /* Warning : caller is responsible to make sure rcu grace period 4900 * is respected before freeing memory containing @napi 4901 */ 4902 bool napi_hash_del(struct napi_struct *napi) 4903 { 4904 bool rcu_sync_needed = false; 4905 4906 spin_lock(&napi_hash_lock); 4907 4908 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { 4909 rcu_sync_needed = true; 4910 hlist_del_rcu(&napi->napi_hash_node); 4911 } 4912 spin_unlock(&napi_hash_lock); 4913 return rcu_sync_needed; 4914 } 4915 EXPORT_SYMBOL_GPL(napi_hash_del); 4916 4917 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 4918 { 4919 struct napi_struct *napi; 4920 4921 napi = container_of(timer, struct napi_struct, timer); 4922 if (napi->gro_list) 4923 napi_schedule(napi); 4924 4925 return HRTIMER_NORESTART; 4926 } 4927 4928 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 4929 int (*poll)(struct napi_struct *, int), int weight) 4930 { 4931 INIT_LIST_HEAD(&napi->poll_list); 4932 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 4933 napi->timer.function = napi_watchdog; 4934 
napi->gro_count = 0; 4935 napi->gro_list = NULL; 4936 napi->skb = NULL; 4937 napi->poll = poll; 4938 if (weight > NAPI_POLL_WEIGHT) 4939 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 4940 weight, dev->name); 4941 napi->weight = weight; 4942 list_add(&napi->dev_list, &dev->napi_list); 4943 napi->dev = dev; 4944 #ifdef CONFIG_NETPOLL 4945 spin_lock_init(&napi->poll_lock); 4946 napi->poll_owner = -1; 4947 #endif 4948 set_bit(NAPI_STATE_SCHED, &napi->state); 4949 napi_hash_add(napi); 4950 } 4951 EXPORT_SYMBOL(netif_napi_add); 4952 4953 void napi_disable(struct napi_struct *n) 4954 { 4955 might_sleep(); 4956 set_bit(NAPI_STATE_DISABLE, &n->state); 4957 4958 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 4959 msleep(1); 4960 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) 4961 msleep(1); 4962 4963 hrtimer_cancel(&n->timer); 4964 4965 clear_bit(NAPI_STATE_DISABLE, &n->state); 4966 } 4967 EXPORT_SYMBOL(napi_disable); 4968 4969 /* Must be called in process context */ 4970 void netif_napi_del(struct napi_struct *napi) 4971 { 4972 might_sleep(); 4973 if (napi_hash_del(napi)) 4974 synchronize_net(); 4975 list_del_init(&napi->dev_list); 4976 napi_free_frags(napi); 4977 4978 kfree_skb_list(napi->gro_list); 4979 napi->gro_list = NULL; 4980 napi->gro_count = 0; 4981 } 4982 EXPORT_SYMBOL(netif_napi_del); 4983 4984 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 4985 { 4986 void *have; 4987 int work, weight; 4988 4989 list_del_init(&n->poll_list); 4990 4991 have = netpoll_poll_lock(n); 4992 4993 weight = n->weight; 4994 4995 /* This NAPI_STATE_SCHED test is for avoiding a race 4996 * with netpoll's poll_napi(). Only the entity which 4997 * obtains the lock and sees NAPI_STATE_SCHED set will 4998 * actually make the ->poll() call. Therefore we avoid 4999 * accidentally calling ->poll() when NAPI is not scheduled. 
5000 */ 5001 work = 0; 5002 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 5003 work = n->poll(n, weight); 5004 trace_napi_poll(n); 5005 } 5006 5007 WARN_ON_ONCE(work > weight); 5008 5009 if (likely(work < weight)) 5010 goto out_unlock; 5011 5012 /* Drivers must not modify the NAPI state if they 5013 * consume the entire weight. In such cases this code 5014 * still "owns" the NAPI instance and therefore can 5015 * move the instance around on the list at-will. 5016 */ 5017 if (unlikely(napi_disable_pending(n))) { 5018 napi_complete(n); 5019 goto out_unlock; 5020 } 5021 5022 if (n->gro_list) { 5023 /* flush too old packets 5024 * If HZ < 1000, flush all packets. 5025 */ 5026 napi_gro_flush(n, HZ >= 1000); 5027 } 5028 5029 /* Some drivers may have called napi_schedule 5030 * prior to exhausting their budget. 5031 */ 5032 if (unlikely(!list_empty(&n->poll_list))) { 5033 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 5034 n->dev ? n->dev->name : "backlog"); 5035 goto out_unlock; 5036 } 5037 5038 list_add_tail(&n->poll_list, repoll); 5039 5040 out_unlock: 5041 netpoll_poll_unlock(have); 5042 5043 return work; 5044 } 5045 5046 static void net_rx_action(struct softirq_action *h) 5047 { 5048 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5049 unsigned long time_limit = jiffies + 2; 5050 int budget = netdev_budget; 5051 LIST_HEAD(list); 5052 LIST_HEAD(repoll); 5053 5054 local_irq_disable(); 5055 list_splice_init(&sd->poll_list, &list); 5056 local_irq_enable(); 5057 5058 for (;;) { 5059 struct napi_struct *n; 5060 5061 if (list_empty(&list)) { 5062 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5063 return; 5064 break; 5065 } 5066 5067 n = list_first_entry(&list, struct napi_struct, poll_list); 5068 budget -= napi_poll(n, &repoll); 5069 5070 /* If softirq window is exhausted then punt. 5071 * Allow this to run for 2 jiffies since which will allow 5072 * an average latency of 1.5/HZ. 
5073 */ 5074 if (unlikely(budget <= 0 || 5075 time_after_eq(jiffies, time_limit))) { 5076 sd->time_squeeze++; 5077 break; 5078 } 5079 } 5080 5081 local_irq_disable(); 5082 5083 list_splice_tail_init(&sd->poll_list, &list); 5084 list_splice_tail(&repoll, &list); 5085 list_splice(&list, &sd->poll_list); 5086 if (!list_empty(&sd->poll_list)) 5087 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5088 5089 net_rps_action_and_irq_enable(sd); 5090 } 5091 5092 struct netdev_adjacent { 5093 struct net_device *dev; 5094 5095 /* upper master flag, there can only be one master device per list */ 5096 bool master; 5097 5098 /* counter for the number of times this device was added to us */ 5099 u16 ref_nr; 5100 5101 /* private field for the users */ 5102 void *private; 5103 5104 struct list_head list; 5105 struct rcu_head rcu; 5106 }; 5107 5108 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, 5109 struct list_head *adj_list) 5110 { 5111 struct netdev_adjacent *adj; 5112 5113 list_for_each_entry(adj, adj_list, list) { 5114 if (adj->dev == adj_dev) 5115 return adj; 5116 } 5117 return NULL; 5118 } 5119 5120 /** 5121 * netdev_has_upper_dev - Check if device is linked to an upper device 5122 * @dev: device 5123 * @upper_dev: upper device to check 5124 * 5125 * Find out if a device is linked to specified upper device and return true 5126 * in case it is. Note that this checks only immediate upper device, 5127 * not through a complete stack of devices. The caller must hold the RTNL lock. 5128 */ 5129 bool netdev_has_upper_dev(struct net_device *dev, 5130 struct net_device *upper_dev) 5131 { 5132 ASSERT_RTNL(); 5133 5134 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); 5135 } 5136 EXPORT_SYMBOL(netdev_has_upper_dev); 5137 5138 /** 5139 * netdev_has_any_upper_dev - Check if device is linked to some device 5140 * @dev: device 5141 * 5142 * Find out if a device is linked to an upper device and return true in case 5143 * it is. 
The caller must hold the RTNL lock. 5144 */ 5145 static bool netdev_has_any_upper_dev(struct net_device *dev) 5146 { 5147 ASSERT_RTNL(); 5148 5149 return !list_empty(&dev->all_adj_list.upper); 5150 } 5151 5152 /** 5153 * netdev_master_upper_dev_get - Get master upper device 5154 * @dev: device 5155 * 5156 * Find a master upper device and return pointer to it or NULL in case 5157 * it's not there. The caller must hold the RTNL lock. 5158 */ 5159 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 5160 { 5161 struct netdev_adjacent *upper; 5162 5163 ASSERT_RTNL(); 5164 5165 if (list_empty(&dev->adj_list.upper)) 5166 return NULL; 5167 5168 upper = list_first_entry(&dev->adj_list.upper, 5169 struct netdev_adjacent, list); 5170 if (likely(upper->master)) 5171 return upper->dev; 5172 return NULL; 5173 } 5174 EXPORT_SYMBOL(netdev_master_upper_dev_get); 5175 5176 void *netdev_adjacent_get_private(struct list_head *adj_list) 5177 { 5178 struct netdev_adjacent *adj; 5179 5180 adj = list_entry(adj_list, struct netdev_adjacent, list); 5181 5182 return adj->private; 5183 } 5184 EXPORT_SYMBOL(netdev_adjacent_get_private); 5185 5186 /** 5187 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 5188 * @dev: device 5189 * @iter: list_head ** of the current position 5190 * 5191 * Gets the next device from the dev's upper list, starting from iter 5192 * position. The caller must hold RCU read lock. 
5193 */ 5194 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 5195 struct list_head **iter) 5196 { 5197 struct netdev_adjacent *upper; 5198 5199 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5200 5201 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5202 5203 if (&upper->list == &dev->adj_list.upper) 5204 return NULL; 5205 5206 *iter = &upper->list; 5207 5208 return upper->dev; 5209 } 5210 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5211 5212 /** 5213 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 5214 * @dev: device 5215 * @iter: list_head ** of the current position 5216 * 5217 * Gets the next device from the dev's upper list, starting from iter 5218 * position. The caller must hold RCU read lock. 5219 */ 5220 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 5221 struct list_head **iter) 5222 { 5223 struct netdev_adjacent *upper; 5224 5225 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5226 5227 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5228 5229 if (&upper->list == &dev->all_adj_list.upper) 5230 return NULL; 5231 5232 *iter = &upper->list; 5233 5234 return upper->dev; 5235 } 5236 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 5237 5238 /** 5239 * netdev_lower_get_next_private - Get the next ->private from the 5240 * lower neighbour list 5241 * @dev: device 5242 * @iter: list_head ** of the current position 5243 * 5244 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5245 * list, starting from iter position. The caller must hold either hold the 5246 * RTNL lock or its own locking that guarantees that the neighbour lower 5247 * list will remain unchanged. 
5248 */ 5249 void *netdev_lower_get_next_private(struct net_device *dev, 5250 struct list_head **iter) 5251 { 5252 struct netdev_adjacent *lower; 5253 5254 lower = list_entry(*iter, struct netdev_adjacent, list); 5255 5256 if (&lower->list == &dev->adj_list.lower) 5257 return NULL; 5258 5259 *iter = lower->list.next; 5260 5261 return lower->private; 5262 } 5263 EXPORT_SYMBOL(netdev_lower_get_next_private); 5264 5265 /** 5266 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5267 * lower neighbour list, RCU 5268 * variant 5269 * @dev: device 5270 * @iter: list_head ** of the current position 5271 * 5272 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5273 * list, starting from iter position. The caller must hold RCU read lock. 5274 */ 5275 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5276 struct list_head **iter) 5277 { 5278 struct netdev_adjacent *lower; 5279 5280 WARN_ON_ONCE(!rcu_read_lock_held()); 5281 5282 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5283 5284 if (&lower->list == &dev->adj_list.lower) 5285 return NULL; 5286 5287 *iter = &lower->list; 5288 5289 return lower->private; 5290 } 5291 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5292 5293 /** 5294 * netdev_lower_get_next - Get the next device from the lower neighbour 5295 * list 5296 * @dev: device 5297 * @iter: list_head ** of the current position 5298 * 5299 * Gets the next netdev_adjacent from the dev's lower neighbour 5300 * list, starting from iter position. The caller must hold RTNL lock or 5301 * its own locking that guarantees that the neighbour lower 5302 * list will remain unchanged. 
5303 */ 5304 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5305 { 5306 struct netdev_adjacent *lower; 5307 5308 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 5309 5310 if (&lower->list == &dev->adj_list.lower) 5311 return NULL; 5312 5313 *iter = &lower->list; 5314 5315 return lower->dev; 5316 } 5317 EXPORT_SYMBOL(netdev_lower_get_next); 5318 5319 /** 5320 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5321 * lower neighbour list, RCU 5322 * variant 5323 * @dev: device 5324 * 5325 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5326 * list. The caller must hold RCU read lock. 5327 */ 5328 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5329 { 5330 struct netdev_adjacent *lower; 5331 5332 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5333 struct netdev_adjacent, list); 5334 if (lower) 5335 return lower->private; 5336 return NULL; 5337 } 5338 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5339 5340 /** 5341 * netdev_master_upper_dev_get_rcu - Get master upper device 5342 * @dev: device 5343 * 5344 * Find a master upper device and return pointer to it or NULL in case 5345 * it's not there. The caller must hold the RCU read lock. 5346 */ 5347 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5348 { 5349 struct netdev_adjacent *upper; 5350 5351 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5352 struct netdev_adjacent, list); 5353 if (upper && likely(upper->master)) 5354 return upper->dev; 5355 return NULL; 5356 } 5357 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5358 5359 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5360 struct net_device *adj_dev, 5361 struct list_head *dev_list) 5362 { 5363 char linkname[IFNAMSIZ+7]; 5364 sprintf(linkname, dev_list == &dev->adj_list.upper ? 
5365 "upper_%s" : "lower_%s", adj_dev->name); 5366 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5367 linkname); 5368 } 5369 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5370 char *name, 5371 struct list_head *dev_list) 5372 { 5373 char linkname[IFNAMSIZ+7]; 5374 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5375 "upper_%s" : "lower_%s", name); 5376 sysfs_remove_link(&(dev->dev.kobj), linkname); 5377 } 5378 5379 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5380 struct net_device *adj_dev, 5381 struct list_head *dev_list) 5382 { 5383 return (dev_list == &dev->adj_list.upper || 5384 dev_list == &dev->adj_list.lower) && 5385 net_eq(dev_net(dev), dev_net(adj_dev)); 5386 } 5387 5388 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5389 struct net_device *adj_dev, 5390 struct list_head *dev_list, 5391 void *private, bool master) 5392 { 5393 struct netdev_adjacent *adj; 5394 int ret; 5395 5396 adj = __netdev_find_adj(adj_dev, dev_list); 5397 5398 if (adj) { 5399 adj->ref_nr++; 5400 return 0; 5401 } 5402 5403 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5404 if (!adj) 5405 return -ENOMEM; 5406 5407 adj->dev = adj_dev; 5408 adj->master = master; 5409 adj->ref_nr = 1; 5410 adj->private = private; 5411 dev_hold(adj_dev); 5412 5413 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5414 adj_dev->name, dev->name, adj_dev->name); 5415 5416 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5417 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5418 if (ret) 5419 goto free_adj; 5420 } 5421 5422 /* Ensure that master link is always the first item in list. 
*/ 5423 if (master) { 5424 ret = sysfs_create_link(&(dev->dev.kobj), 5425 &(adj_dev->dev.kobj), "master"); 5426 if (ret) 5427 goto remove_symlinks; 5428 5429 list_add_rcu(&adj->list, dev_list); 5430 } else { 5431 list_add_tail_rcu(&adj->list, dev_list); 5432 } 5433 5434 return 0; 5435 5436 remove_symlinks: 5437 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5438 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5439 free_adj: 5440 kfree(adj); 5441 dev_put(adj_dev); 5442 5443 return ret; 5444 } 5445 5446 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5447 struct net_device *adj_dev, 5448 struct list_head *dev_list) 5449 { 5450 struct netdev_adjacent *adj; 5451 5452 adj = __netdev_find_adj(adj_dev, dev_list); 5453 5454 if (!adj) { 5455 pr_err("tried to remove device %s from %s\n", 5456 dev->name, adj_dev->name); 5457 BUG(); 5458 } 5459 5460 if (adj->ref_nr > 1) { 5461 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5462 adj->ref_nr-1); 5463 adj->ref_nr--; 5464 return; 5465 } 5466 5467 if (adj->master) 5468 sysfs_remove_link(&(dev->dev.kobj), "master"); 5469 5470 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5471 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5472 5473 list_del_rcu(&adj->list); 5474 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5475 adj_dev->name, dev->name, adj_dev->name); 5476 dev_put(adj_dev); 5477 kfree_rcu(adj, rcu); 5478 } 5479 5480 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5481 struct net_device *upper_dev, 5482 struct list_head *up_list, 5483 struct list_head *down_list, 5484 void *private, bool master) 5485 { 5486 int ret; 5487 5488 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5489 master); 5490 if (ret) 5491 return ret; 5492 5493 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5494 false); 5495 if (ret) { 5496 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5497 return 
ret; 5498 } 5499 5500 return 0; 5501 } 5502 5503 static int __netdev_adjacent_dev_link(struct net_device *dev, 5504 struct net_device *upper_dev) 5505 { 5506 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5507 &dev->all_adj_list.upper, 5508 &upper_dev->all_adj_list.lower, 5509 NULL, false); 5510 } 5511 5512 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5513 struct net_device *upper_dev, 5514 struct list_head *up_list, 5515 struct list_head *down_list) 5516 { 5517 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5518 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5519 } 5520 5521 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5522 struct net_device *upper_dev) 5523 { 5524 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5525 &dev->all_adj_list.upper, 5526 &upper_dev->all_adj_list.lower); 5527 } 5528 5529 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5530 struct net_device *upper_dev, 5531 void *private, bool master) 5532 { 5533 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5534 5535 if (ret) 5536 return ret; 5537 5538 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5539 &dev->adj_list.upper, 5540 &upper_dev->adj_list.lower, 5541 private, master); 5542 if (ret) { 5543 __netdev_adjacent_dev_unlink(dev, upper_dev); 5544 return ret; 5545 } 5546 5547 return 0; 5548 } 5549 5550 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5551 struct net_device *upper_dev) 5552 { 5553 __netdev_adjacent_dev_unlink(dev, upper_dev); 5554 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5555 &dev->adj_list.upper, 5556 &upper_dev->adj_list.lower); 5557 } 5558 5559 static int __netdev_upper_dev_link(struct net_device *dev, 5560 struct net_device *upper_dev, bool master, 5561 void *upper_priv, void *upper_info) 5562 { 5563 struct netdev_notifier_changeupper_info changeupper_info; 5564 struct netdev_adjacent *i, *j, *to_i, *to_j; 5565 int ret = 0; 5566 
5567 ASSERT_RTNL(); 5568 5569 if (dev == upper_dev) 5570 return -EBUSY; 5571 5572 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5573 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) 5574 return -EBUSY; 5575 5576 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) 5577 return -EEXIST; 5578 5579 if (master && netdev_master_upper_dev_get(dev)) 5580 return -EBUSY; 5581 5582 changeupper_info.upper_dev = upper_dev; 5583 changeupper_info.master = master; 5584 changeupper_info.linking = true; 5585 changeupper_info.upper_info = upper_info; 5586 5587 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5588 &changeupper_info.info); 5589 ret = notifier_to_errno(ret); 5590 if (ret) 5591 return ret; 5592 5593 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, 5594 master); 5595 if (ret) 5596 return ret; 5597 5598 /* Now that we linked these devs, make all the upper_dev's 5599 * all_adj_list.upper visible to every dev's all_adj_list.lower an 5600 * versa, and don't forget the devices itself. All of these 5601 * links are non-neighbours. 
5602 */ 5603 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5604 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5605 pr_debug("Interlinking %s with %s, non-neighbour\n", 5606 i->dev->name, j->dev->name); 5607 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5608 if (ret) 5609 goto rollback_mesh; 5610 } 5611 } 5612 5613 /* add dev to every upper_dev's upper device */ 5614 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5615 pr_debug("linking %s's upper device %s with %s\n", 5616 upper_dev->name, i->dev->name, dev->name); 5617 ret = __netdev_adjacent_dev_link(dev, i->dev); 5618 if (ret) 5619 goto rollback_upper_mesh; 5620 } 5621 5622 /* add upper_dev to every dev's lower device */ 5623 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5624 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5625 i->dev->name, upper_dev->name); 5626 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5627 if (ret) 5628 goto rollback_lower_mesh; 5629 } 5630 5631 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5632 &changeupper_info.info); 5633 ret = notifier_to_errno(ret); 5634 if (ret) 5635 goto rollback_lower_mesh; 5636 5637 return 0; 5638 5639 rollback_lower_mesh: 5640 to_i = i; 5641 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5642 if (i == to_i) 5643 break; 5644 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5645 } 5646 5647 i = NULL; 5648 5649 rollback_upper_mesh: 5650 to_i = i; 5651 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5652 if (i == to_i) 5653 break; 5654 __netdev_adjacent_dev_unlink(dev, i->dev); 5655 } 5656 5657 i = j = NULL; 5658 5659 rollback_mesh: 5660 to_i = i; 5661 to_j = j; 5662 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5663 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5664 if (i == to_i && j == to_j) 5665 break; 5666 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5667 } 5668 if (i == to_i) 5669 break; 5670 } 5671 5672 
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5673 5674 return ret; 5675 } 5676 5677 /** 5678 * netdev_upper_dev_link - Add a link to the upper device 5679 * @dev: device 5680 * @upper_dev: new upper device 5681 * 5682 * Adds a link to device which is upper to this one. The caller must hold 5683 * the RTNL lock. On a failure a negative errno code is returned. 5684 * On success the reference counts are adjusted and the function 5685 * returns zero. 5686 */ 5687 int netdev_upper_dev_link(struct net_device *dev, 5688 struct net_device *upper_dev) 5689 { 5690 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); 5691 } 5692 EXPORT_SYMBOL(netdev_upper_dev_link); 5693 5694 /** 5695 * netdev_master_upper_dev_link - Add a master link to the upper device 5696 * @dev: device 5697 * @upper_dev: new upper device 5698 * @upper_priv: upper device private 5699 * @upper_info: upper info to be passed down via notifier 5700 * 5701 * Adds a link to device which is upper to this one. In this case, only 5702 * one master upper device can be linked, although other non-master devices 5703 * might be linked as well. The caller must hold the RTNL lock. 5704 * On a failure a negative errno code is returned. On success the reference 5705 * counts are adjusted and the function returns zero. 5706 */ 5707 int netdev_master_upper_dev_link(struct net_device *dev, 5708 struct net_device *upper_dev, 5709 void *upper_priv, void *upper_info) 5710 { 5711 return __netdev_upper_dev_link(dev, upper_dev, true, 5712 upper_priv, upper_info); 5713 } 5714 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5715 5716 /** 5717 * netdev_upper_dev_unlink - Removes a link to upper device 5718 * @dev: device 5719 * @upper_dev: new upper device 5720 * 5721 * Removes a link to device which is upper to this one. The caller must hold 5722 * the RTNL lock. 
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j;
	ASSERT_RTNL();

	changeupper_info.upper_dev = upper_dev;
	/* Record whether we are detaching the master before the link goes. */
	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
	changeupper_info.linking = false;

	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
				      &changeupper_info.info);

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	/* Here is the tricky part. We must remove all dev's lower
	 * devices from all upper_dev's upper devices and vice
	 * versa, to maintain the graph relationship.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
			__netdev_adjacent_dev_unlink(i->dev, j->dev);

	/* remove also the devices themselves from the lower/upper device
	 * lists
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);

	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
		__netdev_adjacent_dev_unlink(dev, i->dev);

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
	struct netdev_notifier_bonding_info info;

	/* Copy the caller's info into the notifier payload; the notifier
	 * chain only receives a pointer to info.info.
	 */
	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

/* Re-create the sysfs adjacency symlinks between dev and all of its direct
 * upper/lower neighbours.  Cross-namespace neighbours are skipped, matching
 * netdev_adjacent_del_links() below.
 */
static void netdev_adjacent_add_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

/* Remove the sysfs adjacency symlinks between dev and all of its direct
 * upper/lower neighbours (same-namespace neighbours only).
 */
static void netdev_adjacent_del_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

/* Rename the sysfs adjacency symlinks that neighbours keep for dev after
 * dev has been renamed from @oldname: delete the link under the old name,
 * then add it back under the current dev->name.
 */
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
	}
}

/* Return the private data a lower device registered when it was linked
 * under dev, or NULL if lower_dev is NULL or not a direct lower neighbour.
 */
void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev)
{
	struct netdev_adjacent *lower;

	if (!lower_dev)
		return NULL;
	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
	if (!lower)
		return NULL;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);


/* Recursively compute the nesting depth of dev counting only devices for
 * which type_check() is true (e.g. stacked VLANs/bonds); used for lockdep
 * subclass selection.  Returns -1 when no device in the chain matches.
 */
int dev_get_nest_level(struct net_device *dev,
		       bool (*type_check)(const struct net_device *dev))
{
	struct net_device *lower = NULL;
	struct list_head *iter;
	int max_nest = -1;
	int nest;

	ASSERT_RTNL();

	netdev_for_each_lower_dev(dev, lower, iter) {
		nest = dev_get_nest_level(lower, type_check);
		if (max_nest < nest)
			max_nest = nest;
	}

	if (type_check(dev))
		max_nest++;

	return max_nest;
}
EXPORT_SYMBOL(dev_get_nest_level);

/**
 * netdev_lower_state_changed - Dispatch event about lower device state change
 * @lower_dev: device
 * @lower_state_info: state to dispatch
 *
 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_lower_state_changed(struct net_device *lower_dev,
				void *lower_state_info)
{
	struct netdev_notifier_changelowerstate_info changelowerstate_info;

	ASSERT_RTNL();
	changelowerstate_info.lower_state_info = lower_state_info;
	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
				      &changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);

/* Tell the driver (if it cares) that one of the IFF_* RX flags changed. */
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_rx_flags)
		ops->ndo_change_rx_flags(dev, flags);
}

/* Adjust the promiscuity refcount by @inc; the device is promiscuous while
 * the count is non-zero.  Returns -EOVERFLOW if a positive @inc would wrap
 * the (unsigned) counter.  @notify controls whether userspace is told via
 * __dev_notify_flags().
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		/* Promiscuous transitions are security-relevant: audit them
		 * with the credentials of the task that triggered the change.
		 */
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				  AUDIT_ANOM_PROMISCUOUS,
				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				  dev->name, (dev->flags & IFF_PROMISC),
				  (old_flags & IFF_PROMISC),
				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
				  from_kuid(&init_user_ns, uid),
				  from_kgid(&init_user_ns, gid),
				  audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc, true);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);

/* Adjust the allmulti refcount by @inc, mirroring __dev_set_promiscuity():
 * returns -EOVERFLOW if a positive @inc would wrap the counter; @notify
 * controls userspace notification.
 */
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}

/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all interfaces. Once it hits zero the device reverts back to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);

/*
 * Upload unicast and multicast address lists to device and
 * configure RX filtering. When the device doesn't support unicast
 * filtering it is put in promiscuous mode while unicast addresses
 * are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags&IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		/* Fall back to promiscuous mode while unicast addresses are
		 * programmed on a device without unicast filtering; drop it
		 * again once the list empties.  uc_promisc remembers that the
		 * promisc reference was taken by this path.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}

/* Locked wrapper around __dev_set_rx_mode(). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}

/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	/* Userspace sees gflags for PROMISC/ALLMULTI (what it asked for),
	 * and operational state bits derived from the live device state.
	 */
	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);

/* Apply a userspace flag word to the device without sending notifications;
 * dev_change_flags() is the notifying wrapper.  Returns the result of a
 * possible __dev_open()/__dev_close() triggered by an IFF_UP transition.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 * Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 * Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 * Have we downed the interface. We handle IFF_UP ourselves
	 * according to user attempts to set it, rather than blindly
	 * setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		/* deliberately shadows the outer old_flags: we only care
		 * whether __dev_set_promiscuity() changed flags just now
		 */
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}

/* Broadcast the flag changes computed against @old_flags: an rtnetlink
 * message when any userspace-visible (gflags) bit changed, NETDEV_UP/DOWN
 * on IFF_UP transitions, and NETDEV_CHANGE for other flag changes on a
 * running device.
 */
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (gchanges)
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = changes;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
	}
}

/**
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 *
 * Change settings on device based state flags. The flags are
 * in the userspace exported format.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int ret;
	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	/* Notify for both real flag changes and gflags-only changes. */
	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
	__dev_notify_flags(dev, old_flags, changes);
	return ret;
}
EXPORT_SYMBOL(dev_change_flags);

/* Set the MTU via the driver hook if present, otherwise directly. */
static int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

	dev->mtu = new_mtu;
	return 0;
}

/**
 * dev_set_mtu - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Change the maximum transfer size of the network device.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	int err, orig_mtu;

	if (new_mtu == dev->mtu)
		return 0;

	/* MTU must be positive. */
	if (new_mtu < 0)
		return -EINVAL;

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Give interested parties a chance to veto the change up front. */
	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);

	if (!err) {
		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
		}
	}
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);

/**
 * dev_set_group - Change group this device belongs to
 * @dev: device
 * @new_group: group this device should belong to
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);

/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 *
 * Change the hardware (MAC) address of the device
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	/* Address family must match the link type (e.g. ARPHRD_ETHER). */
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	if (err)
		return err;
	dev->addr_assign_type = NET_ADDR_SET;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	/* Feed the new address into the entropy pool. */
	add_device_randomness(dev->dev_addr, dev->addr_len);
	return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);

/**
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);

/**
 * dev_get_phys_port_id - Get device physical port ID
 * @dev: device
 * @ppid: port ID
 *
 * Get device physical port ID
 */
int dev_get_phys_port_id(struct net_device *dev,
			 struct netdev_phys_item_id *ppid)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);

/**
 * dev_get_phys_port_name - Get device physical port name
 * @dev: device
 * @name: port name
 * @len: size of the @name buffer
 *
 * Get device physical port name
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_name)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_name(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);

/**
 * dev_change_proto_down - update protocol port state information
 * @dev: device
 * @proto_down: new value
 *
 * This info can be used by switch drivers to set the phys state of the
 * port.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_proto_down)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);

/**
 * dev_new_index - allocate an ifindex
 * @net: the applicable net namespace
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	/* Scan upward from the last allocated index, wrapping past INT_MAX
	 * back to 1, until a free ifindex is found.
	 */
	int ifindex = net->ifindex;
	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return net->ifindex = ifindex;
	}
}

/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

/* Queue dev for deferred (un)registration work processed at rtnl_unlock
 * time, and account it against its namespace's unregistering count.
 */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}

/* Tear down a list of devices that were (partially) registered: close them,
 * unlink them from the device chain, notify protocols, flush addresses and
 * queues, and drop the registration reference.  Order of the phases matters;
 * synchronize_net() separates unlisting from the destructive steps.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
		/* Purge per-cpu backlog queues of skbs for this device. */
		on_each_cpu(flush_backlog, dev, 1);
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* Build the RTM_DELLINK skb before uninit so the info is
		 * still accurate; send it after uninit below.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}

/* Single-device convenience wrapper around rollback_registered_many(). */
static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}

/* Strip from @features any NETIF_F_UPPER_DISABLES feature that the upper
 * device has turned off: an upper device that disables such a feature must
 * see it disabled on its lower devices as well.
 */
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
	struct net_device *upper, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(upper->wanted_features & feature)
		    && (features & feature)) {
			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
				   &feature, upper->name);
			features &= ~feature;
		}
	}

	return features;
}

/* Propagate the disabling of NETIF_F_UPPER_DISABLES features from @upper to
 * @lower by clearing them from lower->wanted_features and re-running the
 * feature update on the lower device.
 */
static void netdev_sync_lower_features(struct net_device *upper,
	struct net_device *lower, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(features & feature) && (lower->features & feature)) {
			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
				   &feature, lower->name);
			lower->wanted_features &= ~feature;
			netdev_update_features(lower);

			if (unlikely(lower->features & feature))
				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
					    &feature, lower->name);
		}
	}
}

/* Resolve inter-feature dependencies, dropping feature bits whose
 * prerequisites are not met.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
	    !(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
	    !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG.
	 */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!(features & NETIF_F_HW_CSUM) &&
		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				   "Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				   "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

	/* Busy-poll support is advertised iff the driver implements the hook. */
#ifdef CONFIG_NET_RX_BUSY_POLL
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}

/* Recompute dev->features from wanted features, driver constraints and
 * upper/lower device constraints.  Returns 1 if the feature set changed
 * (caller should send notifications), 0 otherwise.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}

/**
 * netdev_update_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications if it
 * has changed. Should be called after driver or hardware dependent
 * conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);

/**
 * netdev_change_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications even
 * if they have not changed. Should be called instead of
 * netdev_update_features() if also dev->vlan_features might
 * have changed to allow the changes to be propagated to stacked
 * VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);

/**
 * netif_stacked_transfer_operstate - transfer operstate
 * @rootdev: the root or lower level device to transfer state from
 * @dev: the device to transfer operstate to
 *
 * Transfer operational state from root to device.
This is normally 6724 * called when a stacking relationship exists between the root 6725 * device and the device(a leaf device). 6726 */ 6727 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 6728 struct net_device *dev) 6729 { 6730 if (rootdev->operstate == IF_OPER_DORMANT) 6731 netif_dormant_on(dev); 6732 else 6733 netif_dormant_off(dev); 6734 6735 if (netif_carrier_ok(rootdev)) { 6736 if (!netif_carrier_ok(dev)) 6737 netif_carrier_on(dev); 6738 } else { 6739 if (netif_carrier_ok(dev)) 6740 netif_carrier_off(dev); 6741 } 6742 } 6743 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 6744 6745 #ifdef CONFIG_SYSFS 6746 static int netif_alloc_rx_queues(struct net_device *dev) 6747 { 6748 unsigned int i, count = dev->num_rx_queues; 6749 struct netdev_rx_queue *rx; 6750 size_t sz = count * sizeof(*rx); 6751 6752 BUG_ON(count < 1); 6753 6754 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6755 if (!rx) { 6756 rx = vzalloc(sz); 6757 if (!rx) 6758 return -ENOMEM; 6759 } 6760 dev->_rx = rx; 6761 6762 for (i = 0; i < count; i++) 6763 rx[i].dev = dev; 6764 return 0; 6765 } 6766 #endif 6767 6768 static void netdev_init_one_queue(struct net_device *dev, 6769 struct netdev_queue *queue, void *_unused) 6770 { 6771 /* Initialize queue lock */ 6772 spin_lock_init(&queue->_xmit_lock); 6773 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 6774 queue->xmit_lock_owner = -1; 6775 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 6776 queue->dev = dev; 6777 #ifdef CONFIG_BQL 6778 dql_init(&queue->dql, HZ); 6779 #endif 6780 } 6781 6782 static void netif_free_tx_queues(struct net_device *dev) 6783 { 6784 kvfree(dev->_tx); 6785 } 6786 6787 static int netif_alloc_netdev_queues(struct net_device *dev) 6788 { 6789 unsigned int count = dev->num_tx_queues; 6790 struct netdev_queue *tx; 6791 size_t sz = count * sizeof(*tx); 6792 6793 if (count < 1 || count > 0xffff) 6794 return -EINVAL; 6795 6796 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | 
__GFP_REPEAT); 6797 if (!tx) { 6798 tx = vzalloc(sz); 6799 if (!tx) 6800 return -ENOMEM; 6801 } 6802 dev->_tx = tx; 6803 6804 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 6805 spin_lock_init(&dev->tx_global_lock); 6806 6807 return 0; 6808 } 6809 6810 void netif_tx_stop_all_queues(struct net_device *dev) 6811 { 6812 unsigned int i; 6813 6814 for (i = 0; i < dev->num_tx_queues; i++) { 6815 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 6816 netif_tx_stop_queue(txq); 6817 } 6818 } 6819 EXPORT_SYMBOL(netif_tx_stop_all_queues); 6820 6821 /** 6822 * register_netdevice - register a network device 6823 * @dev: device to register 6824 * 6825 * Take a completed network device structure and add it to the kernel 6826 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6827 * chain. 0 is returned on success. A negative errno code is returned 6828 * on a failure to set up the device, or if the name is a duplicate. 6829 * 6830 * Callers must hold the rtnl semaphore. You may want 6831 * register_netdev() instead of this. 6832 * 6833 * BUGS: 6834 * The locking appears insufficient to guarantee two parallel registers 6835 * will not get the same name. 6836 */ 6837 6838 int register_netdevice(struct net_device *dev) 6839 { 6840 int ret; 6841 struct net *net = dev_net(dev); 6842 6843 BUG_ON(dev_boot_phase); 6844 ASSERT_RTNL(); 6845 6846 might_sleep(); 6847 6848 /* When net_device's are persistent, this will be fatal. 
	 */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* Normalize positive driver return values to -EIO. */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	/* A driver advertising VLAN filtering must supply both VLAN ops. */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK)) {
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed registration: tear everything down
		 * again and report the error to the caller.
		 */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	/* Undo a successful ndo_init() before reporting failure. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);

/**
 *	init_dummy_netdev	- init a dummy network device for NAPI
 *	@dev: device to init
 *
 *	This takes a network device structure and initializes the minimum
 *	amount of fields so it can be used to schedule NAPI polls without
 *	registering a full blown interface.
This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note : We dont allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' dont need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);


/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);

/* Sum the per-cpu reference counters into the device's total refcount. */
int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);

/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		/* Once a second, re-notify in case a holder missed the
		 * original UNREGISTER event.
		 */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* Let RCU callbacks queued by the notifiers run
			 * before sending the FINAL notification.
			 */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* Every ten seconds, warn loudly: a stuck refcount here
		 * usually means a protocol leaked a dev_hold().
		 */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}

/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* Block until every dev_hold() has been balanced. */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}

/* Convert net_device_stats to rtnl_link_stats64.  They have the same
 * fields in the same order, with only the type differing.
7184 */ 7185 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 7186 const struct net_device_stats *netdev_stats) 7187 { 7188 #if BITS_PER_LONG == 64 7189 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 7190 memcpy(stats64, netdev_stats, sizeof(*stats64)); 7191 #else 7192 size_t i, n = sizeof(*stats64) / sizeof(u64); 7193 const unsigned long *src = (const unsigned long *)netdev_stats; 7194 u64 *dst = (u64 *)stats64; 7195 7196 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 7197 sizeof(*stats64) / sizeof(u64)); 7198 for (i = 0; i < n; i++) 7199 dst[i] = src[i]; 7200 #endif 7201 } 7202 EXPORT_SYMBOL(netdev_stats_to_stats64); 7203 7204 /** 7205 * dev_get_stats - get network device statistics 7206 * @dev: device to get statistics from 7207 * @storage: place to store stats 7208 * 7209 * Get network statistics from device. Return @storage. 7210 * The device driver may provide its own method by setting 7211 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 7212 * otherwise the internal statistics structure is used. 
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* Prefer the driver's 64-bit stats op, then the legacy 'unsigned
	 * long' op, and finally the device's built-in counters.
	 */
	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	/* Add drops accounted by the core, outside any driver. */
	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);

/* Return the device's ingress queue, allocating and installing it on
 * first use when CONFIG_NET_CLS_ACT is enabled; otherwise just return
 * whatever dev_ingress_queue() yields.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	/* Start with the no-op qdisc until a real one is attached. */
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

/* Install @ops only if the device still has the (empty) default ops,
 * i.e. the driver did not set its own ethtool_ops.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

/* Free the memory block underlying a net_device, undoing the alignment
 * padding applied in alloc_netdev_mqs().
 */
void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}

/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv: size of private data to allocate space for
 *	@name: device name format string
 *	@name_assign_type: origin of device name
 *	@setup: callback to initialize device
 *	@txqs: the number of TX subqueues to allocate
 *	@rxqs: the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	/* Quiet kzalloc first, vmalloc fallback, as for the queue arrays. */
	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	/* Remember the padding so netdev_freemem() can find the real base. */
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* Let the caller (e.g. ether_setup) customize the device. */
	setup(dev);

	if (!dev->tx_queue_len)
		dev->priv_flags |= IFF_NO_QUEUE;

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	/* reg_state is still NETREG_UNINITIALIZED here, so free_netdev()
	 * releases the memory directly.
	 */
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);

/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 *	Must be called in process context.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	might_sleep();
	netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
	kvfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers: a device that
	 * never made it through register_netdevice() is freed directly.
	 */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	/* Use the expedited variant under RTNL to shorten the time the
	 * lock is held.
	 */
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		/* Defer: batch with the other devices on @head. */
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);

/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);

/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing todo */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		dev->ifindex = dev_new_index(net);

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

/* CPU hotplug notifier: when a CPU dies, migrate its per-cpu softnet
 * work (completion queue, output queue, NAPI poll list, backlog
 * packets) onto the CPU running this callback.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	/* Only interested in the CPU_DEAD phases. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		/* The backlog NAPI belongs to the dead CPU; just clear its
		 * state instead of rescheduling it here.
		 */
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}


/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_HW_CSUM)
		mask |= NETIF_F_CSUM_MASK;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);

/* Allocate a hash table of NETDEV_HASHENTRIES empty hlist heads, or
 * return NULL on allocation failure.
 */
static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	/* init_net's dev_base_head is statically initialized. */
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

/* Common backend for netdev_printk() and the netdev_<level>() helpers:
 * prefix the message with driver/bus/device identification when a
 * parent device exists, otherwise with just the netdev name.
 */
static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		/* level[1] is the digit of the KERN_<LEVEL> string. */
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

/* Generate one exported netdev_<level>() wrapper per log level, all
 * funneling into __netdev_printk() with the matching KERN_<LEVEL>.
 */
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

/* Per-namespace teardown: release the name/index hash tables. */
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e.
		   loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		/* Drop the lock and sleep until a waiter is woken by
		 * netdev_run_todo() finishing an unregistration.
		 */
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed
	 * wait here for all pending unregistrations to complete,
	 * before unregistering the loopback device and allowing the
	 * network namespace be freed.
	 *
	 * The netdev todo list containing all network devices
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);