1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <linux/uaccess.h> 76 #include <linux/bitops.h> 77 #include <linux/capability.h> 78 #include <linux/cpu.h> 79 #include <linux/types.h> 80 #include <linux/kernel.h> 81 #include <linux/hash.h> 82 #include <linux/slab.h> 83 #include <linux/sched.h> 84 #include <linux/mutex.h> 85 #include <linux/string.h> 86 #include <linux/mm.h> 87 #include <linux/socket.h> 88 #include <linux/sockios.h> 89 #include <linux/errno.h> 90 #include <linux/interrupt.h> 91 #include <linux/if_ether.h> 92 #include <linux/netdevice.h> 93 #include <linux/etherdevice.h> 94 #include <linux/ethtool.h> 95 #include <linux/notifier.h> 96 #include <linux/skbuff.h> 97 #include <linux/bpf.h> 98 #include <linux/bpf_trace.h> 99 #include <net/net_namespace.h> 100 #include <net/sock.h> 101 #include <net/busy_poll.h> 102 #include <linux/rtnetlink.h> 103 #include <linux/stat.h> 104 #include <net/dst.h> 105 #include <net/dst_metadata.h> 106 #include <net/pkt_sched.h> 107 #include <net/checksum.h> 108 #include <net/xfrm.h> 109 #include <linux/highmem.h> 110 #include <linux/init.h> 111 #include <linux/module.h> 112 #include <linux/netpoll.h> 113 #include <linux/rcupdate.h> 114 #include <linux/delay.h> 115 #include <net/iw_handler.h> 116 #include <asm/current.h> 117 #include <linux/audit.h> 118 #include <linux/dmaengine.h> 119 #include <linux/err.h> 120 #include <linux/ctype.h> 121 #include <linux/if_arp.h> 122 #include <linux/if_vlan.h> 123 #include <linux/ip.h> 124 #include <net/ip.h> 125 #include <net/mpls.h> 126 #include <linux/ipv6.h> 127 #include <linux/in.h> 128 #include <linux/jhash.h> 129 #include <linux/random.h> 130 #include <trace/events/napi.h> 131 #include <trace/events/net.h> 132 #include <trace/events/skb.h> 133 #include <linux/pci.h> 134 #include <linux/inetdevice.h> 135 #include <linux/cpu_rmap.h> 136 #include <linux/static_key.h> 137 #include <linux/hashtable.h> 138 #include <linux/vmalloc.h> 139 #include <linux/if_macvlan.h> 140 #include <linux/errqueue.h> 141 #include <linux/hrtimer.h> 142 #include <linux/netfilter_ingress.h> 143 #include <linux/crash_dump.h> 144 145 #include "net-sysfs.h" 146 147 /* Instead of increasing this, you should create a hash table. */ 148 #define MAX_GRO_SKBS 8 149 150 /* This should be increased if a protocol with a bigger head is added. */ 151 #define GRO_MAX_HEAD (MAX_HEADER + 128) 152 153 static DEFINE_SPINLOCK(ptype_lock); 154 static DEFINE_SPINLOCK(offload_lock); 155 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 156 struct list_head ptype_all __read_mostly; /* Taps */ 157 static struct list_head offload_base __read_mostly; 158 159 static int netif_rx_internal(struct sk_buff *skb); 160 static int call_netdevice_notifiers_info(unsigned long val, 161 struct net_device *dev, 162 struct netdev_notifier_info *info); 163 164 /* 165 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 166 * semaphore. 167 * 168 * Pure readers hold dev_base_lock for reading, or rcu_read_lock() 169 * 170 * Writers must hold the rtnl semaphore while they loop through the 171 * dev_base_head list, and hold dev_base_lock for writing when they do the 172 * actual updates. This allows pure readers to access the list even 173 * while a writer is preparing to update it. 174 * 175 * To put it another way, dev_base_lock is held for writing only to 176 * protect against pure readers; the rtnl semaphore provides the 177 * protection against other writers. 178 * 179 * See, for example usages, register_netdevice() and 180 * unregister_netdevice(), which must be called with the rtnl 181 * semaphore held. 182 */ 183 DEFINE_RWLOCK(dev_base_lock); 184 EXPORT_SYMBOL(dev_base_lock); 185 186 /* protects napi_hash addition/deletion and napi_gen_id */ 187 static DEFINE_SPINLOCK(napi_hash_lock); 188 189 static unsigned int napi_gen_id = NR_CPUS; 190 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); 191 192 static seqcount_t devnet_rename_seq; 193 194 static inline void dev_base_seq_inc(struct net *net) 195 { 196 while (++net->dev_base_seq == 0) 197 ; 198 } 199 200 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 201 { 202 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); 203 204 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; 205 } 206 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 208 { 209 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; 210 } 211 212 static inline void rps_lock(struct softnet_data *sd) 213 { 214 #ifdef CONFIG_RPS 215 spin_lock(&sd->input_pkt_queue.lock); 216 #endif 217 } 218 219 static inline void rps_unlock(struct softnet_data *sd) 220 { 221 #ifdef CONFIG_RPS 222 spin_unlock(&sd->input_pkt_queue.lock); 223 #endif 224 } 225 226 /* Device list insertion */ 227 static void list_netdevice(struct net_device *dev) 228 { 229 struct net *net = dev_net(dev); 230 231 ASSERT_RTNL(); 232 233 write_lock_bh(&dev_base_lock); 234 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); 235 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 236 hlist_add_head_rcu(&dev->index_hlist, 237 dev_index_hash(net, dev->ifindex)); 238 write_unlock_bh(&dev_base_lock); 239 240 dev_base_seq_inc(net); 241 } 242 243 /* Device list removal 244 * caller must respect a RCU grace period before freeing/reusing dev 245 */ 246 static void unlist_netdevice(struct net_device *dev) 247 { 248 ASSERT_RTNL(); 249 250 /* Unlink dev from the device chain */ 251 write_lock_bh(&dev_base_lock); 252 list_del_rcu(&dev->dev_list); 253 hlist_del_rcu(&dev->name_hlist); 254 hlist_del_rcu(&dev->index_hlist); 255 write_unlock_bh(&dev_base_lock); 256 257 dev_base_seq_inc(dev_net(dev)); 258 } 259 260 /* 261 * Our notifier list 262 */ 263 264 static RAW_NOTIFIER_HEAD(netdev_chain); 265 266 /* 267 * Device drivers call our routines to queue packets here. We empty the 268 * queue in the local softnet handler. 269 */ 270 271 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); 272 EXPORT_PER_CPU_SYMBOL(softnet_data); 273 274 #ifdef CONFIG_LOCKDEP 275 /* 276 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 277 * according to dev->type 278 */ 279 static const unsigned short netdev_lock_type[] = { 280 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 281 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 282 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 283 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 284 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 285 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 286 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 287 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 288 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 289 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 290 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 291 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 292 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, 293 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, 294 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; 295 296 static const char *const netdev_lock_name[] = { 297 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 298 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 299 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 300 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 301 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 302 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 303 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 304 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 305 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 306 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 307 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 308 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 309 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", 310 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", 311 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; 312 313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 315 316 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 317 { 318 int i; 319 320 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 321 if (netdev_lock_type[i] == dev_type) 322 return i; 323 /* the last key is used by default */ 324 return ARRAY_SIZE(netdev_lock_type) - 1; 325 } 326 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 328 unsigned short dev_type) 329 { 330 int i; 331 332 i = netdev_lock_pos(dev_type); 333 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 334 netdev_lock_name[i]); 335 } 336 337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 338 { 339 int i; 340 341 i = netdev_lock_pos(dev->type); 342 lockdep_set_class_and_name(&dev->addr_list_lock, 343 &netdev_addr_lock_key[i], 344 netdev_lock_name[i]); 345 } 346 #else 347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 348 unsigned short dev_type) 349 { 350 } 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 352 { 353 } 354 #endif 355 356 /******************************************************************************* 357 * 358 * Protocol management and registration routines 359 * 360 *******************************************************************************/ 361 362 363 /* 364 * Add a protocol ID to the list. Now that the input handler is 365 * smarter we can dispense with all the messy stuff that used to be 366 * here. 367 * 368 * BEWARE!!! Protocol handlers, mangling input packets, 369 * MUST BE last in hash buckets and checking protocol handlers 370 * MUST start from promiscuous ptype_all chain in net_bh. 371 * It is true now, do not change it. 372 * Explanation follows: if protocol handler, mangling packet, will 373 * be the first on list, it is not able to sense, that packet 374 * is cloned and should be copied-on-write, so that it will 375 * change it and subsequent readers will get broken packet. 376 * --ANK (980803) 377 */ 378 379 static inline struct list_head *ptype_head(const struct packet_type *pt) 380 { 381 if (pt->type == htons(ETH_P_ALL)) 382 return pt->dev ? &pt->dev->ptype_all : &ptype_all; 383 else 384 return pt->dev ? &pt->dev->ptype_specific : 385 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 386 } 387 388 /** 389 * dev_add_pack - add packet handler 390 * @pt: packet type declaration 391 * 392 * Add a protocol handler to the networking stack. The passed &packet_type 393 * is linked into kernel lists and may not be freed until it has been 394 * removed from the kernel lists. 395 * 396 * This call does not sleep therefore it can not 397 * guarantee all CPU's that are in middle of receiving packets 398 * will see the new packet type (until the next received packet). 399 */ 400 401 void dev_add_pack(struct packet_type *pt) 402 { 403 struct list_head *head = ptype_head(pt); 404 405 spin_lock(&ptype_lock); 406 list_add_rcu(&pt->list, head); 407 spin_unlock(&ptype_lock); 408 } 409 EXPORT_SYMBOL(dev_add_pack); 410 411 /** 412 * __dev_remove_pack - remove packet handler 413 * @pt: packet type declaration 414 * 415 * Remove a protocol handler that was previously added to the kernel 416 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 417 * from the kernel lists and can be freed or reused once this function 418 * returns. 419 * 420 * The packet type might still be in use by receivers 421 * and must not be freed until after all the CPU's have gone 422 * through a quiescent state. 423 */ 424 void __dev_remove_pack(struct packet_type *pt) 425 { 426 struct list_head *head = ptype_head(pt); 427 struct packet_type *pt1; 428 429 spin_lock(&ptype_lock); 430 431 list_for_each_entry(pt1, head, list) { 432 if (pt == pt1) { 433 list_del_rcu(&pt->list); 434 goto out; 435 } 436 } 437 438 pr_warn("dev_remove_pack: %p not found\n", pt); 439 out: 440 spin_unlock(&ptype_lock); 441 } 442 EXPORT_SYMBOL(__dev_remove_pack); 443 444 /** 445 * dev_remove_pack - remove packet handler 446 * @pt: packet type declaration 447 * 448 * Remove a protocol handler that was previously added to the kernel 449 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 450 * from the kernel lists and can be freed or reused once this function 451 * returns. 452 * 453 * This call sleeps to guarantee that no CPU is looking at the packet 454 * type after return. 455 */ 456 void dev_remove_pack(struct packet_type *pt) 457 { 458 __dev_remove_pack(pt); 459 460 synchronize_net(); 461 } 462 EXPORT_SYMBOL(dev_remove_pack); 463 464 465 /** 466 * dev_add_offload - register offload handlers 467 * @po: protocol offload declaration 468 * 469 * Add protocol offload handlers to the networking stack. The passed 470 * &proto_offload is linked into kernel lists and may not be freed until 471 * it has been removed from the kernel lists. 472 * 473 * This call does not sleep therefore it can not 474 * guarantee all CPU's that are in middle of receiving packets 475 * will see the new offload handlers (until the next received packet). 476 */ 477 void dev_add_offload(struct packet_offload *po) 478 { 479 struct packet_offload *elem; 480 481 spin_lock(&offload_lock); 482 list_for_each_entry(elem, &offload_base, list) { 483 if (po->priority < elem->priority) 484 break; 485 } 486 list_add_rcu(&po->list, elem->list.prev); 487 spin_unlock(&offload_lock); 488 } 489 EXPORT_SYMBOL(dev_add_offload); 490 491 /** 492 * __dev_remove_offload - remove offload handler 493 * @po: packet offload declaration 494 * 495 * Remove a protocol offload handler that was previously added to the 496 * kernel offload handlers by dev_add_offload(). The passed &offload_type 497 * is removed from the kernel lists and can be freed or reused once this 498 * function returns. 499 * 500 * The packet type might still be in use by receivers 501 * and must not be freed until after all the CPU's have gone 502 * through a quiescent state. 503 */ 504 static void __dev_remove_offload(struct packet_offload *po) 505 { 506 struct list_head *head = &offload_base; 507 struct packet_offload *po1; 508 509 spin_lock(&offload_lock); 510 511 list_for_each_entry(po1, head, list) { 512 if (po == po1) { 513 list_del_rcu(&po->list); 514 goto out; 515 } 516 } 517 518 pr_warn("dev_remove_offload: %p not found\n", po); 519 out: 520 spin_unlock(&offload_lock); 521 } 522 523 /** 524 * dev_remove_offload - remove packet offload handler 525 * @po: packet offload declaration 526 * 527 * Remove a packet offload handler that was previously added to the kernel 528 * offload handlers by dev_add_offload(). The passed &offload_type is 529 * removed from the kernel lists and can be freed or reused once this 530 * function returns. 531 * 532 * This call sleeps to guarantee that no CPU is looking at the packet 533 * type after return. 534 */ 535 void dev_remove_offload(struct packet_offload *po) 536 { 537 __dev_remove_offload(po); 538 539 synchronize_net(); 540 } 541 EXPORT_SYMBOL(dev_remove_offload); 542 543 /****************************************************************************** 544 * 545 * Device Boot-time Settings Routines 546 * 547 ******************************************************************************/ 548 549 /* Boot time configuration table */ 550 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 551 552 /** 553 * netdev_boot_setup_add - add new setup entry 554 * @name: name of the device 555 * @map: configured settings for the device 556 * 557 * Adds new setup entry to the dev_boot_setup list. The function 558 * returns 0 on error and 1 on success. This is a generic routine to 559 * all netdevices. 560 */ 561 static int netdev_boot_setup_add(char *name, struct ifmap *map) 562 { 563 struct netdev_boot_setup *s; 564 int i; 565 566 s = dev_boot_setup; 567 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 568 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 569 memset(s[i].name, 0, sizeof(s[i].name)); 570 strlcpy(s[i].name, name, IFNAMSIZ); 571 memcpy(&s[i].map, map, sizeof(s[i].map)); 572 break; 573 } 574 } 575 576 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 577 } 578 579 /** 580 * netdev_boot_setup_check - check boot time settings 581 * @dev: the netdevice 582 * 583 * Check boot time settings for the device. 584 * The found settings are set for the device to be used 585 * later in the device probing. 586 * Returns 0 if no settings found, 1 if they are. 587 */ 588 int netdev_boot_setup_check(struct net_device *dev) 589 { 590 struct netdev_boot_setup *s = dev_boot_setup; 591 int i; 592 593 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 594 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 595 !strcmp(dev->name, s[i].name)) { 596 dev->irq = s[i].map.irq; 597 dev->base_addr = s[i].map.base_addr; 598 dev->mem_start = s[i].map.mem_start; 599 dev->mem_end = s[i].map.mem_end; 600 return 1; 601 } 602 } 603 return 0; 604 } 605 EXPORT_SYMBOL(netdev_boot_setup_check); 606 607 608 /** 609 * netdev_boot_base - get address from boot time settings 610 * @prefix: prefix for network device 611 * @unit: id for network device 612 * 613 * Check boot time settings for the base address of device. 614 * The found settings are set for the device to be used 615 * later in the device probing. 616 * Returns 0 if no settings found. 617 */ 618 unsigned long netdev_boot_base(const char *prefix, int unit) 619 { 620 const struct netdev_boot_setup *s = dev_boot_setup; 621 char name[IFNAMSIZ]; 622 int i; 623 624 sprintf(name, "%s%d", prefix, unit); 625 626 /* 627 * If device already registered then return base of 1 628 * to indicate not to probe for this interface 629 */ 630 if (__dev_get_by_name(&init_net, name)) 631 return 1; 632 633 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 634 if (!strcmp(name, s[i].name)) 635 return s[i].map.base_addr; 636 return 0; 637 } 638 639 /* 640 * Saves at boot time configured settings for any netdevice. 641 */ 642 int __init netdev_boot_setup(char *str) 643 { 644 int ints[5]; 645 struct ifmap map; 646 647 str = get_options(str, ARRAY_SIZE(ints), ints); 648 if (!str || !*str) 649 return 0; 650 651 /* Save settings */ 652 memset(&map, 0, sizeof(map)); 653 if (ints[0] > 0) 654 map.irq = ints[1]; 655 if (ints[0] > 1) 656 map.base_addr = ints[2]; 657 if (ints[0] > 2) 658 map.mem_start = ints[3]; 659 if (ints[0] > 3) 660 map.mem_end = ints[4]; 661 662 /* Add new entry to the list */ 663 return netdev_boot_setup_add(str, &map); 664 } 665 666 __setup("netdev=", netdev_boot_setup); 667 668 /******************************************************************************* 669 * 670 * Device Interface Subroutines 671 * 672 *******************************************************************************/ 673 674 /** 675 * dev_get_iflink - get 'iflink' value of a interface 676 * @dev: targeted interface 677 * 678 * Indicates the ifindex the interface is linked to. 679 * Physical interfaces have the same 'ifindex' and 'iflink' values. 680 */ 681 682 int dev_get_iflink(const struct net_device *dev) 683 { 684 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) 685 return dev->netdev_ops->ndo_get_iflink(dev); 686 687 return dev->ifindex; 688 } 689 EXPORT_SYMBOL(dev_get_iflink); 690 691 /** 692 * dev_fill_metadata_dst - Retrieve tunnel egress information. 693 * @dev: targeted interface 694 * @skb: The packet. 695 * 696 * For better visibility of tunnel traffic OVS needs to retrieve 697 * egress tunnel information for a packet. Following API allows 698 * user to get this info. 699 */ 700 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) 701 { 702 struct ip_tunnel_info *info; 703 704 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) 705 return -EINVAL; 706 707 info = skb_tunnel_info_unclone(skb); 708 if (!info) 709 return -ENOMEM; 710 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) 711 return -EINVAL; 712 713 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); 714 } 715 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); 716 717 /** 718 * __dev_get_by_name - find a device by its name 719 * @net: the applicable net namespace 720 * @name: name to find 721 * 722 * Find an interface by name. Must be called under RTNL semaphore 723 * or @dev_base_lock. If the name is found a pointer to the device 724 * is returned. If the name is not found then %NULL is returned. The 725 * reference counters are not incremented so the caller must be 726 * careful with locks. 727 */ 728 729 struct net_device *__dev_get_by_name(struct net *net, const char *name) 730 { 731 struct net_device *dev; 732 struct hlist_head *head = dev_name_hash(net, name); 733 734 hlist_for_each_entry(dev, head, name_hlist) 735 if (!strncmp(dev->name, name, IFNAMSIZ)) 736 return dev; 737 738 return NULL; 739 } 740 EXPORT_SYMBOL(__dev_get_by_name); 741 742 /** 743 * dev_get_by_name_rcu - find a device by its name 744 * @net: the applicable net namespace 745 * @name: name to find 746 * 747 * Find an interface by name. 748 * If the name is found a pointer to the device is returned. 749 * If the name is not found then %NULL is returned. 750 * The reference counters are not incremented so the caller must be 751 * careful with locks. The caller must hold RCU lock. 752 */ 753 754 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 755 { 756 struct net_device *dev; 757 struct hlist_head *head = dev_name_hash(net, name); 758 759 hlist_for_each_entry_rcu(dev, head, name_hlist) 760 if (!strncmp(dev->name, name, IFNAMSIZ)) 761 return dev; 762 763 return NULL; 764 } 765 EXPORT_SYMBOL(dev_get_by_name_rcu); 766 767 /** 768 * dev_get_by_name - find a device by its name 769 * @net: the applicable net namespace 770 * @name: name to find 771 * 772 * Find an interface by name. This can be called from any 773 * context and does its own locking. The returned handle has 774 * the usage count incremented and the caller must use dev_put() to 775 * release it when it is no longer needed. %NULL is returned if no 776 * matching device is found. 777 */ 778 779 struct net_device *dev_get_by_name(struct net *net, const char *name) 780 { 781 struct net_device *dev; 782 783 rcu_read_lock(); 784 dev = dev_get_by_name_rcu(net, name); 785 if (dev) 786 dev_hold(dev); 787 rcu_read_unlock(); 788 return dev; 789 } 790 EXPORT_SYMBOL(dev_get_by_name); 791 792 /** 793 * __dev_get_by_index - find a device by its ifindex 794 * @net: the applicable net namespace 795 * @ifindex: index of device 796 * 797 * Search for an interface by index. Returns %NULL if the device 798 * is not found or a pointer to the device. The device has not 799 * had its reference counter increased so the caller must be careful 800 * about locking. The caller must hold either the RTNL semaphore 801 * or @dev_base_lock. 802 */ 803 804 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 805 { 806 struct net_device *dev; 807 struct hlist_head *head = dev_index_hash(net, ifindex); 808 809 hlist_for_each_entry(dev, head, index_hlist) 810 if (dev->ifindex == ifindex) 811 return dev; 812 813 return NULL; 814 } 815 EXPORT_SYMBOL(__dev_get_by_index); 816 817 /** 818 * dev_get_by_index_rcu - find a device by its ifindex 819 * @net: the applicable net namespace 820 * @ifindex: index of device 821 * 822 * Search for an interface by index. Returns %NULL if the device 823 * is not found or a pointer to the device. The device has not 824 * had its reference counter increased so the caller must be careful 825 * about locking. The caller must hold RCU lock. 826 */ 827 828 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) 829 { 830 struct net_device *dev; 831 struct hlist_head *head = dev_index_hash(net, ifindex); 832 833 hlist_for_each_entry_rcu(dev, head, index_hlist) 834 if (dev->ifindex == ifindex) 835 return dev; 836 837 return NULL; 838 } 839 EXPORT_SYMBOL(dev_get_by_index_rcu); 840 841 842 /** 843 * dev_get_by_index - find a device by its ifindex 844 * @net: the applicable net namespace 845 * @ifindex: index of device 846 * 847 * Search for an interface by index. Returns NULL if the device 848 * is not found or a pointer to the device. The device returned has 849 * had a reference added and the pointer is safe until the user calls 850 * dev_put to indicate they have finished with it. 851 */ 852 853 struct net_device *dev_get_by_index(struct net *net, int ifindex) 854 { 855 struct net_device *dev; 856 857 rcu_read_lock(); 858 dev = dev_get_by_index_rcu(net, ifindex); 859 if (dev) 860 dev_hold(dev); 861 rcu_read_unlock(); 862 return dev; 863 } 864 EXPORT_SYMBOL(dev_get_by_index); 865 866 /** 867 * netdev_get_name - get a netdevice name, knowing its ifindex. 868 * @net: network namespace 869 * @name: a pointer to the buffer where the name will be stored. 870 * @ifindex: the ifindex of the interface to get the name from. 871 * 872 * The use of raw_seqcount_begin() and cond_resched() before 873 * retrying is required as we want to give the writers a chance 874 * to complete when CONFIG_PREEMPT is not set. 875 */ 876 int netdev_get_name(struct net *net, char *name, int ifindex) 877 { 878 struct net_device *dev; 879 unsigned int seq; 880 881 retry: 882 seq = raw_seqcount_begin(&devnet_rename_seq); 883 rcu_read_lock(); 884 dev = dev_get_by_index_rcu(net, ifindex); 885 if (!dev) { 886 rcu_read_unlock(); 887 return -ENODEV; 888 } 889 890 strcpy(name, dev->name); 891 rcu_read_unlock(); 892 if (read_seqcount_retry(&devnet_rename_seq, seq)) { 893 cond_resched(); 894 goto retry; 895 } 896 897 return 0; 898 } 899 900 /** 901 * dev_getbyhwaddr_rcu - find a device by its hardware address 902 * @net: the applicable net namespace 903 * @type: media type of device 904 * @ha: hardware address 905 * 906 * Search for an interface by MAC address. Returns NULL if the device 907 * is not found or a pointer to the device. 908 * The caller must hold RCU or RTNL. 909 * The returned device has not had its ref count increased 910 * and the caller must therefore be careful about locking 911 * 912 */ 913 914 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, 915 const char *ha) 916 { 917 struct net_device *dev; 918 919 for_each_netdev_rcu(net, dev) 920 if (dev->type == type && 921 !memcmp(dev->dev_addr, ha, dev->addr_len)) 922 return dev; 923 924 return NULL; 925 } 926 EXPORT_SYMBOL(dev_getbyhwaddr_rcu); 927 928 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 929 { 930 struct net_device *dev; 931 932 ASSERT_RTNL(); 933 for_each_netdev(net, dev) 934 if (dev->type == type) 935 return dev; 936 937 return NULL; 938 } 939 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 940 941 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 942 { 943 struct net_device *dev, *ret = NULL; 944 945 rcu_read_lock(); 946 for_each_netdev_rcu(net, dev) 947 if (dev->type == type) { 948 dev_hold(dev); 949 ret = dev; 950 break; 951 } 952 rcu_read_unlock(); 953 return ret; 954 } 955 EXPORT_SYMBOL(dev_getfirstbyhwtype); 956 957 /** 958 * __dev_get_by_flags - find any device with given flags 959 * @net: the applicable net namespace 960 * @if_flags: IFF_* values 961 * @mask: bitmask of bits in if_flags to check 962 * 963 * Search for any interface with the given flags. Returns NULL if a device 964 * is not found or a pointer to the device. Must be called inside 965 * rtnl_lock(), and result refcount is unchanged. 966 */ 967 968 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, 969 unsigned short mask) 970 { 971 struct net_device *dev, *ret; 972 973 ASSERT_RTNL(); 974 975 ret = NULL; 976 for_each_netdev(net, dev) { 977 if (((dev->flags ^ if_flags) & mask) == 0) { 978 ret = dev; 979 break; 980 } 981 } 982 return ret; 983 } 984 EXPORT_SYMBOL(__dev_get_by_flags); 985 986 /** 987 * dev_valid_name - check if name is okay for network device 988 * @name: name string 989 * 990 * Network device names need to be valid file names to 991 * to allow sysfs to work. We also disallow any kind of 992 * whitespace. 993 */ 994 bool dev_valid_name(const char *name) 995 { 996 if (*name == '\0') 997 return false; 998 if (strlen(name) >= IFNAMSIZ) 999 return false; 1000 if (!strcmp(name, ".") || !strcmp(name, "..")) 1001 return false; 1002 1003 while (*name) { 1004 if (*name == '/' || *name == ':' || isspace(*name)) 1005 return false; 1006 name++; 1007 } 1008 return true; 1009 } 1010 EXPORT_SYMBOL(dev_valid_name); 1011 1012 /** 1013 * __dev_alloc_name - allocate a name for a device 1014 * @net: network namespace to allocate the device name in 1015 * @name: name format string 1016 * @buf: scratch buffer and result name string 1017 * 1018 * Passed a format string - eg "lt%d" it will try and find a suitable 1019 * id. It scans list of devices to build up a free map, then chooses 1020 * the first empty slot. The caller must hold the dev_base or rtnl lock 1021 * while allocating the name and adding the device in order to avoid 1022 * duplicates. 1023 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 1024 * Returns the number of the unit assigned or a negative errno code. 1025 */ 1026 1027 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 1028 { 1029 int i = 0; 1030 const char *p; 1031 const int max_netdevices = 8*PAGE_SIZE; 1032 unsigned long *inuse; 1033 struct net_device *d; 1034 1035 p = strnchr(name, IFNAMSIZ-1, '%'); 1036 if (p) { 1037 /* 1038 * Verify the string as this thing may have come from 1039 * the user. There must be either one "%d" and no other "%" 1040 * characters. 1041 */ 1042 if (p[1] != 'd' || strchr(p + 2, '%')) 1043 return -EINVAL; 1044 1045 /* Use one page as a bit array of possible slots */ 1046 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 1047 if (!inuse) 1048 return -ENOMEM; 1049 1050 for_each_netdev(net, d) { 1051 if (!sscanf(d->name, name, &i)) 1052 continue; 1053 if (i < 0 || i >= max_netdevices) 1054 continue; 1055 1056 /* avoid cases where sscanf is not exact inverse of printf */ 1057 snprintf(buf, IFNAMSIZ, name, i); 1058 if (!strncmp(buf, d->name, IFNAMSIZ)) 1059 set_bit(i, inuse); 1060 } 1061 1062 i = find_first_zero_bit(inuse, max_netdevices); 1063 free_page((unsigned long) inuse); 1064 } 1065 1066 if (buf != name) 1067 snprintf(buf, IFNAMSIZ, name, i); 1068 if (!__dev_get_by_name(net, buf)) 1069 return i; 1070 1071 /* It is possible to run out of possible slots 1072 * when the name is long and there isn't enough space left 1073 * for the digits, or if all bits are used. 1074 */ 1075 return -ENFILE; 1076 } 1077 1078 /** 1079 * dev_alloc_name - allocate a name for a device 1080 * @dev: device 1081 * @name: name format string 1082 * 1083 * Passed a format string - eg "lt%d" it will try and find a suitable 1084 * id. It scans list of devices to build up a free map, then chooses 1085 * the first empty slot. The caller must hold the dev_base or rtnl lock 1086 * while allocating the name and adding the device in order to avoid 1087 * duplicates. 1088 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 1089 * Returns the number of the unit assigned or a negative errno code. 1090 */ 1091 1092 int dev_alloc_name(struct net_device *dev, const char *name) 1093 { 1094 char buf[IFNAMSIZ]; 1095 struct net *net; 1096 int ret; 1097 1098 BUG_ON(!dev_net(dev)); 1099 net = dev_net(dev); 1100 ret = __dev_alloc_name(net, name, buf); 1101 if (ret >= 0) 1102 strlcpy(dev->name, buf, IFNAMSIZ); 1103 return ret; 1104 } 1105 EXPORT_SYMBOL(dev_alloc_name); 1106 1107 static int dev_alloc_name_ns(struct net *net, 1108 struct net_device *dev, 1109 const char *name) 1110 { 1111 char buf[IFNAMSIZ]; 1112 int ret; 1113 1114 ret = __dev_alloc_name(net, name, buf); 1115 if (ret >= 0) 1116 strlcpy(dev->name, buf, IFNAMSIZ); 1117 return ret; 1118 } 1119 1120 static int dev_get_valid_name(struct net *net, 1121 struct net_device *dev, 1122 const char *name) 1123 { 1124 BUG_ON(!net); 1125 1126 if (!dev_valid_name(name)) 1127 return -EINVAL; 1128 1129 if (strchr(name, '%')) 1130 return dev_alloc_name_ns(net, dev, name); 1131 else if (__dev_get_by_name(net, name)) 1132 return -EEXIST; 1133 else if (dev->name != name) 1134 strlcpy(dev->name, name, IFNAMSIZ); 1135 1136 return 0; 1137 } 1138 1139 /** 1140 * dev_change_name - change name of a device 1141 * @dev: device 1142 * @newname: name (or format string) must be at least IFNAMSIZ 1143 * 1144 * Change name of a device, can pass format strings "eth%d". 1145 * for wildcarding. 1146 */ 1147 int dev_change_name(struct net_device *dev, const char *newname) 1148 { 1149 unsigned char old_assign_type; 1150 char oldname[IFNAMSIZ]; 1151 int err = 0; 1152 int ret; 1153 struct net *net; 1154 1155 ASSERT_RTNL(); 1156 BUG_ON(!dev_net(dev)); 1157 1158 net = dev_net(dev); 1159 if (dev->flags & IFF_UP) 1160 return -EBUSY; 1161 1162 write_seqcount_begin(&devnet_rename_seq); 1163 1164 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { 1165 write_seqcount_end(&devnet_rename_seq); 1166 return 0; 1167 } 1168 1169 memcpy(oldname, dev->name, IFNAMSIZ); 1170 1171 err = dev_get_valid_name(net, dev, newname); 1172 if (err < 0) { 1173 write_seqcount_end(&devnet_rename_seq); 1174 return err; 1175 } 1176 1177 if (oldname[0] && !strchr(oldname, '%')) 1178 netdev_info(dev, "renamed from %s\n", oldname); 1179 1180 old_assign_type = dev->name_assign_type; 1181 dev->name_assign_type = NET_NAME_RENAMED; 1182 1183 rollback: 1184 ret = device_rename(&dev->dev, dev->name); 1185 if (ret) { 1186 memcpy(dev->name, oldname, IFNAMSIZ); 1187 dev->name_assign_type = old_assign_type; 1188 write_seqcount_end(&devnet_rename_seq); 1189 return ret; 1190 } 1191 1192 write_seqcount_end(&devnet_rename_seq); 1193 1194 netdev_adjacent_rename_links(dev, oldname); 1195 1196 write_lock_bh(&dev_base_lock); 1197 hlist_del_rcu(&dev->name_hlist); 1198 write_unlock_bh(&dev_base_lock); 1199 1200 synchronize_rcu(); 1201 1202 write_lock_bh(&dev_base_lock); 1203 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 1204 write_unlock_bh(&dev_base_lock); 1205 1206 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 1207 ret = notifier_to_errno(ret); 1208 1209 if (ret) { 1210 /* err >= 0 after dev_alloc_name() or stores the first errno */ 1211 if (err >= 0) { 1212 err = ret; 1213 write_seqcount_begin(&devnet_rename_seq); 1214 memcpy(dev->name, oldname, IFNAMSIZ); 1215 memcpy(oldname, newname, IFNAMSIZ); 1216 dev->name_assign_type = old_assign_type; 1217 old_assign_type = NET_NAME_RENAMED; 1218 goto rollback; 1219 } else { 1220 pr_err("%s: name change rollback failed: %d\n", 1221 dev->name, ret); 1222 } 1223 } 1224 1225 return err; 1226 } 1227 1228 /** 1229 * dev_set_alias - change ifalias of a device 1230 * @dev: device 1231 * @alias: name up to IFALIASZ 1232 * @len: limit of bytes to copy from info 1233 * 1234 * Set ifalias for a device, 1235 */ 1236 int dev_set_alias(struct net_device *dev, const char *alias, size_t len) 1237 { 1238 char *new_ifalias; 1239 1240 ASSERT_RTNL(); 1241 1242 if (len >= IFALIASZ) 1243 return -EINVAL; 1244 1245 if (!len) { 1246 kfree(dev->ifalias); 1247 dev->ifalias = NULL; 1248 return 0; 1249 } 1250 1251 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); 1252 if (!new_ifalias) 1253 return -ENOMEM; 1254 dev->ifalias = new_ifalias; 1255 1256 strlcpy(dev->ifalias, alias, len+1); 1257 return len; 1258 } 1259 1260 1261 /** 1262 * netdev_features_change - device changes features 1263 * @dev: device to cause notification 1264 * 1265 * Called to indicate a device has changed features. 1266 */ 1267 void netdev_features_change(struct net_device *dev) 1268 { 1269 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 1270 } 1271 EXPORT_SYMBOL(netdev_features_change); 1272 1273 /** 1274 * netdev_state_change - device changes state 1275 * @dev: device to cause notification 1276 * 1277 * Called to indicate a device has changed state. This function calls 1278 * the notifier chains for netdev_chain and sends a NEWLINK message 1279 * to the routing socket. 1280 */ 1281 void netdev_state_change(struct net_device *dev) 1282 { 1283 if (dev->flags & IFF_UP) { 1284 struct netdev_notifier_change_info change_info; 1285 1286 change_info.flags_changed = 0; 1287 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 1288 &change_info.info); 1289 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); 1290 } 1291 } 1292 EXPORT_SYMBOL(netdev_state_change); 1293 1294 /** 1295 * netdev_notify_peers - notify network peers about existence of @dev 1296 * @dev: network device 1297 * 1298 * Generate traffic such that interested network peers are aware of 1299 * @dev, such as by generating a gratuitous ARP. This may be used when 1300 * a device wants to inform the rest of the network about some sort of 1301 * reconfiguration such as a failover event or virtual machine 1302 * migration. 1303 */ 1304 void netdev_notify_peers(struct net_device *dev) 1305 { 1306 rtnl_lock(); 1307 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1308 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); 1309 rtnl_unlock(); 1310 } 1311 EXPORT_SYMBOL(netdev_notify_peers); 1312 1313 static int __dev_open(struct net_device *dev) 1314 { 1315 const struct net_device_ops *ops = dev->netdev_ops; 1316 int ret; 1317 1318 ASSERT_RTNL(); 1319 1320 if (!netif_device_present(dev)) 1321 return -ENODEV; 1322 1323 /* Block netpoll from trying to do any rx path servicing. 1324 * If we don't do this there is a chance ndo_poll_controller 1325 * or ndo_poll may be running while we open the device 1326 */ 1327 netpoll_poll_disable(dev); 1328 1329 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1330 ret = notifier_to_errno(ret); 1331 if (ret) 1332 return ret; 1333 1334 set_bit(__LINK_STATE_START, &dev->state); 1335 1336 if (ops->ndo_validate_addr) 1337 ret = ops->ndo_validate_addr(dev); 1338 1339 if (!ret && ops->ndo_open) 1340 ret = ops->ndo_open(dev); 1341 1342 netpoll_poll_enable(dev); 1343 1344 if (ret) 1345 clear_bit(__LINK_STATE_START, &dev->state); 1346 else { 1347 dev->flags |= IFF_UP; 1348 dev_set_rx_mode(dev); 1349 dev_activate(dev); 1350 add_device_randomness(dev->dev_addr, dev->addr_len); 1351 } 1352 1353 return ret; 1354 } 1355 1356 /** 1357 * dev_open - prepare an interface for use. 1358 * @dev: device to open 1359 * 1360 * Takes a device from down to up state. The device's private open 1361 * function is invoked and then the multicast lists are loaded. Finally 1362 * the device is moved into the up state and a %NETDEV_UP message is 1363 * sent to the netdev notifier chain. 1364 * 1365 * Calling this function on an active interface is a nop. On a failure 1366 * a negative errno code is returned. 1367 */ 1368 int dev_open(struct net_device *dev) 1369 { 1370 int ret; 1371 1372 if (dev->flags & IFF_UP) 1373 return 0; 1374 1375 ret = __dev_open(dev); 1376 if (ret < 0) 1377 return ret; 1378 1379 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1380 call_netdevice_notifiers(NETDEV_UP, dev); 1381 1382 return ret; 1383 } 1384 EXPORT_SYMBOL(dev_open); 1385 1386 static int __dev_close_many(struct list_head *head) 1387 { 1388 struct net_device *dev; 1389 1390 ASSERT_RTNL(); 1391 might_sleep(); 1392 1393 list_for_each_entry(dev, head, close_list) { 1394 /* Temporarily disable netpoll until the interface is down */ 1395 netpoll_poll_disable(dev); 1396 1397 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1398 1399 clear_bit(__LINK_STATE_START, &dev->state); 1400 1401 /* Synchronize to scheduled poll. We cannot touch poll list, it 1402 * can be even on different cpu. So just clear netif_running(). 1403 * 1404 * dev->stop() will invoke napi_disable() on all of it's 1405 * napi_struct instances on this device. 1406 */ 1407 smp_mb__after_atomic(); /* Commit netif_running(). */ 1408 } 1409 1410 dev_deactivate_many(head); 1411 1412 list_for_each_entry(dev, head, close_list) { 1413 const struct net_device_ops *ops = dev->netdev_ops; 1414 1415 /* 1416 * Call the device specific close. This cannot fail. 1417 * Only if device is UP 1418 * 1419 * We allow it to be called even after a DETACH hot-plug 1420 * event. 1421 */ 1422 if (ops->ndo_stop) 1423 ops->ndo_stop(dev); 1424 1425 dev->flags &= ~IFF_UP; 1426 netpoll_poll_enable(dev); 1427 } 1428 1429 return 0; 1430 } 1431 1432 static int __dev_close(struct net_device *dev) 1433 { 1434 int retval; 1435 LIST_HEAD(single); 1436 1437 list_add(&dev->close_list, &single); 1438 retval = __dev_close_many(&single); 1439 list_del(&single); 1440 1441 return retval; 1442 } 1443 1444 int dev_close_many(struct list_head *head, bool unlink) 1445 { 1446 struct net_device *dev, *tmp; 1447 1448 /* Remove the devices that don't need to be closed */ 1449 list_for_each_entry_safe(dev, tmp, head, close_list) 1450 if (!(dev->flags & IFF_UP)) 1451 list_del_init(&dev->close_list); 1452 1453 __dev_close_many(head); 1454 1455 list_for_each_entry_safe(dev, tmp, head, close_list) { 1456 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1457 call_netdevice_notifiers(NETDEV_DOWN, dev); 1458 if (unlink) 1459 list_del_init(&dev->close_list); 1460 } 1461 1462 return 0; 1463 } 1464 EXPORT_SYMBOL(dev_close_many); 1465 1466 /** 1467 * dev_close - shutdown an interface. 1468 * @dev: device to shutdown 1469 * 1470 * This function moves an active device into down state. A 1471 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1472 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1473 * chain. 1474 */ 1475 int dev_close(struct net_device *dev) 1476 { 1477 if (dev->flags & IFF_UP) { 1478 LIST_HEAD(single); 1479 1480 list_add(&dev->close_list, &single); 1481 dev_close_many(&single, true); 1482 list_del(&single); 1483 } 1484 return 0; 1485 } 1486 EXPORT_SYMBOL(dev_close); 1487 1488 1489 /** 1490 * dev_disable_lro - disable Large Receive Offload on a device 1491 * @dev: device 1492 * 1493 * Disable Large Receive Offload (LRO) on a net device. Must be 1494 * called under RTNL. This is needed if received packets may be 1495 * forwarded to another interface. 1496 */ 1497 void dev_disable_lro(struct net_device *dev) 1498 { 1499 struct net_device *lower_dev; 1500 struct list_head *iter; 1501 1502 dev->wanted_features &= ~NETIF_F_LRO; 1503 netdev_update_features(dev); 1504 1505 if (unlikely(dev->features & NETIF_F_LRO)) 1506 netdev_WARN(dev, "failed to disable LRO!\n"); 1507 1508 netdev_for_each_lower_dev(dev, lower_dev, iter) 1509 dev_disable_lro(lower_dev); 1510 } 1511 EXPORT_SYMBOL(dev_disable_lro); 1512 1513 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, 1514 struct net_device *dev) 1515 { 1516 struct netdev_notifier_info info; 1517 1518 netdev_notifier_info_init(&info, dev); 1519 return nb->notifier_call(nb, val, &info); 1520 } 1521 1522 static int dev_boot_phase = 1; 1523 1524 /** 1525 * register_netdevice_notifier - register a network notifier block 1526 * @nb: notifier 1527 * 1528 * Register a notifier to be called when network device events occur. 1529 * The notifier passed is linked into the kernel structures and must 1530 * not be reused until it has been unregistered. A negative errno code 1531 * is returned on a failure. 1532 * 1533 * When registered all registration and up events are replayed 1534 * to the new notifier to allow device to have a race free 1535 * view of the network device list. 1536 */ 1537 1538 int register_netdevice_notifier(struct notifier_block *nb) 1539 { 1540 struct net_device *dev; 1541 struct net_device *last; 1542 struct net *net; 1543 int err; 1544 1545 rtnl_lock(); 1546 err = raw_notifier_chain_register(&netdev_chain, nb); 1547 if (err) 1548 goto unlock; 1549 if (dev_boot_phase) 1550 goto unlock; 1551 for_each_net(net) { 1552 for_each_netdev(net, dev) { 1553 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); 1554 err = notifier_to_errno(err); 1555 if (err) 1556 goto rollback; 1557 1558 if (!(dev->flags & IFF_UP)) 1559 continue; 1560 1561 call_netdevice_notifier(nb, NETDEV_UP, dev); 1562 } 1563 } 1564 1565 unlock: 1566 rtnl_unlock(); 1567 return err; 1568 1569 rollback: 1570 last = dev; 1571 for_each_net(net) { 1572 for_each_netdev(net, dev) { 1573 if (dev == last) 1574 goto outroll; 1575 1576 if (dev->flags & IFF_UP) { 1577 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1578 dev); 1579 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1580 } 1581 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1582 } 1583 } 1584 1585 outroll: 1586 raw_notifier_chain_unregister(&netdev_chain, nb); 1587 goto unlock; 1588 } 1589 EXPORT_SYMBOL(register_netdevice_notifier); 1590 1591 /** 1592 * unregister_netdevice_notifier - unregister a network notifier block 1593 * @nb: notifier 1594 * 1595 * Unregister a notifier previously registered by 1596 * register_netdevice_notifier(). The notifier is unlinked into the 1597 * kernel structures and may then be reused. A negative errno code 1598 * is returned on a failure. 1599 * 1600 * After unregistering unregister and down device events are synthesized 1601 * for all devices on the device list to the removed notifier to remove 1602 * the need for special case cleanup code. 1603 */ 1604 1605 int unregister_netdevice_notifier(struct notifier_block *nb) 1606 { 1607 struct net_device *dev; 1608 struct net *net; 1609 int err; 1610 1611 rtnl_lock(); 1612 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1613 if (err) 1614 goto unlock; 1615 1616 for_each_net(net) { 1617 for_each_netdev(net, dev) { 1618 if (dev->flags & IFF_UP) { 1619 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1620 dev); 1621 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1622 } 1623 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1624 } 1625 } 1626 unlock: 1627 rtnl_unlock(); 1628 return err; 1629 } 1630 EXPORT_SYMBOL(unregister_netdevice_notifier); 1631 1632 /** 1633 * call_netdevice_notifiers_info - call all network notifier blocks 1634 * @val: value passed unmodified to notifier function 1635 * @dev: net_device pointer passed unmodified to notifier function 1636 * @info: notifier information data 1637 * 1638 * Call all network notifier blocks. Parameters and return value 1639 * are as for raw_notifier_call_chain(). 1640 */ 1641 1642 static int call_netdevice_notifiers_info(unsigned long val, 1643 struct net_device *dev, 1644 struct netdev_notifier_info *info) 1645 { 1646 ASSERT_RTNL(); 1647 netdev_notifier_info_init(info, dev); 1648 return raw_notifier_call_chain(&netdev_chain, val, info); 1649 } 1650 1651 /** 1652 * call_netdevice_notifiers - call all network notifier blocks 1653 * @val: value passed unmodified to notifier function 1654 * @dev: net_device pointer passed unmodified to notifier function 1655 * 1656 * Call all network notifier blocks. Parameters and return value 1657 * are as for raw_notifier_call_chain(). 1658 */ 1659 1660 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1661 { 1662 struct netdev_notifier_info info; 1663 1664 return call_netdevice_notifiers_info(val, dev, &info); 1665 } 1666 EXPORT_SYMBOL(call_netdevice_notifiers); 1667 1668 #ifdef CONFIG_NET_INGRESS 1669 static struct static_key ingress_needed __read_mostly; 1670 1671 void net_inc_ingress_queue(void) 1672 { 1673 static_key_slow_inc(&ingress_needed); 1674 } 1675 EXPORT_SYMBOL_GPL(net_inc_ingress_queue); 1676 1677 void net_dec_ingress_queue(void) 1678 { 1679 static_key_slow_dec(&ingress_needed); 1680 } 1681 EXPORT_SYMBOL_GPL(net_dec_ingress_queue); 1682 #endif 1683 1684 #ifdef CONFIG_NET_EGRESS 1685 static struct static_key egress_needed __read_mostly; 1686 1687 void net_inc_egress_queue(void) 1688 { 1689 static_key_slow_inc(&egress_needed); 1690 } 1691 EXPORT_SYMBOL_GPL(net_inc_egress_queue); 1692 1693 void net_dec_egress_queue(void) 1694 { 1695 static_key_slow_dec(&egress_needed); 1696 } 1697 EXPORT_SYMBOL_GPL(net_dec_egress_queue); 1698 #endif 1699 1700 static struct static_key netstamp_needed __read_mostly; 1701 #ifdef HAVE_JUMP_LABEL 1702 static atomic_t netstamp_needed_deferred; 1703 static atomic_t netstamp_wanted; 1704 static void netstamp_clear(struct work_struct *work) 1705 { 1706 int deferred = atomic_xchg(&netstamp_needed_deferred, 0); 1707 int wanted; 1708 1709 wanted = atomic_add_return(deferred, &netstamp_wanted); 1710 if (wanted > 0) 1711 static_key_enable(&netstamp_needed); 1712 else 1713 static_key_disable(&netstamp_needed); 1714 } 1715 static DECLARE_WORK(netstamp_work, netstamp_clear); 1716 #endif 1717 1718 void net_enable_timestamp(void) 1719 { 1720 #ifdef HAVE_JUMP_LABEL 1721 int wanted; 1722 1723 while (1) { 1724 wanted = atomic_read(&netstamp_wanted); 1725 if (wanted <= 0) 1726 break; 1727 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) 1728 return; 1729 } 1730 atomic_inc(&netstamp_needed_deferred); 1731 schedule_work(&netstamp_work); 1732 #else 1733 static_key_slow_inc(&netstamp_needed); 1734 #endif 1735 } 1736 EXPORT_SYMBOL(net_enable_timestamp); 1737 1738 void net_disable_timestamp(void) 1739 { 1740 #ifdef HAVE_JUMP_LABEL 1741 int wanted; 1742 1743 while (1) { 1744 wanted = atomic_read(&netstamp_wanted); 1745 if (wanted <= 1) 1746 break; 1747 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) 1748 return; 1749 } 1750 atomic_dec(&netstamp_needed_deferred); 1751 schedule_work(&netstamp_work); 1752 #else 1753 static_key_slow_dec(&netstamp_needed); 1754 #endif 1755 } 1756 EXPORT_SYMBOL(net_disable_timestamp); 1757 1758 static inline void net_timestamp_set(struct sk_buff *skb) 1759 { 1760 skb->tstamp = 0; 1761 if (static_key_false(&netstamp_needed)) 1762 __net_timestamp(skb); 1763 } 1764 1765 #define net_timestamp_check(COND, SKB) \ 1766 if (static_key_false(&netstamp_needed)) { \ 1767 if ((COND) && !(SKB)->tstamp) \ 1768 __net_timestamp(SKB); \ 1769 } \ 1770 1771 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) 1772 { 1773 unsigned int len; 1774 1775 if (!(dev->flags & IFF_UP)) 1776 return false; 1777 1778 len = dev->mtu + dev->hard_header_len + VLAN_HLEN; 1779 if (skb->len <= len) 1780 return true; 1781 1782 /* if TSO is enabled, we don't care about the length as the packet 1783 * could be forwarded without being segmented before 1784 */ 1785 if (skb_is_gso(skb)) 1786 return true; 1787 1788 return false; 1789 } 1790 EXPORT_SYMBOL_GPL(is_skb_forwardable); 1791 1792 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1793 { 1794 int ret = ____dev_forward_skb(dev, skb); 1795 1796 if (likely(!ret)) { 1797 skb->protocol = eth_type_trans(skb, dev); 1798 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1799 } 1800 1801 return ret; 1802 } 1803 EXPORT_SYMBOL_GPL(__dev_forward_skb); 1804 1805 /** 1806 * dev_forward_skb - loopback an skb to another netif 1807 * 1808 * @dev: destination network device 1809 * @skb: buffer to forward 1810 * 1811 * return values: 1812 * NET_RX_SUCCESS (no congestion) 1813 * NET_RX_DROP (packet was dropped, but freed) 1814 * 1815 * dev_forward_skb can be used for injecting an skb from the 1816 * start_xmit function of one device into the receive queue 1817 * of another device. 1818 * 1819 * The receiving device may be in another namespace, so 1820 * we have to clear all information in the skb that could 1821 * impact namespace isolation. 1822 */ 1823 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1824 { 1825 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); 1826 } 1827 EXPORT_SYMBOL_GPL(dev_forward_skb); 1828 1829 static inline int deliver_skb(struct sk_buff *skb, 1830 struct packet_type *pt_prev, 1831 struct net_device *orig_dev) 1832 { 1833 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 1834 return -ENOMEM; 1835 atomic_inc(&skb->users); 1836 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 1837 } 1838 1839 static inline void deliver_ptype_list_skb(struct sk_buff *skb, 1840 struct packet_type **pt, 1841 struct net_device *orig_dev, 1842 __be16 type, 1843 struct list_head *ptype_list) 1844 { 1845 struct packet_type *ptype, *pt_prev = *pt; 1846 1847 list_for_each_entry_rcu(ptype, ptype_list, list) { 1848 if (ptype->type != type) 1849 continue; 1850 if (pt_prev) 1851 deliver_skb(skb, pt_prev, orig_dev); 1852 pt_prev = ptype; 1853 } 1854 *pt = pt_prev; 1855 } 1856 1857 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) 1858 { 1859 if (!ptype->af_packet_priv || !skb->sk) 1860 return false; 1861 1862 if (ptype->id_match) 1863 return ptype->id_match(ptype, skb->sk); 1864 else if ((struct sock *)ptype->af_packet_priv == skb->sk) 1865 return true; 1866 1867 return false; 1868 } 1869 1870 /* 1871 * Support routine. Sends outgoing frames to any network 1872 * taps currently in use. 1873 */ 1874 1875 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1876 { 1877 struct packet_type *ptype; 1878 struct sk_buff *skb2 = NULL; 1879 struct packet_type *pt_prev = NULL; 1880 struct list_head *ptype_list = &ptype_all; 1881 1882 rcu_read_lock(); 1883 again: 1884 list_for_each_entry_rcu(ptype, ptype_list, list) { 1885 /* Never send packets back to the socket 1886 * they originated from - MvS (miquels@drinkel.ow.org) 1887 */ 1888 if (skb_loop_sk(ptype, skb)) 1889 continue; 1890 1891 if (pt_prev) { 1892 deliver_skb(skb2, pt_prev, skb->dev); 1893 pt_prev = ptype; 1894 continue; 1895 } 1896 1897 /* need to clone skb, done only once */ 1898 skb2 = skb_clone(skb, GFP_ATOMIC); 1899 if (!skb2) 1900 goto out_unlock; 1901 1902 net_timestamp_set(skb2); 1903 1904 /* skb->nh should be correctly 1905 * set by sender, so that the second statement is 1906 * just protection against buggy protocols. 1907 */ 1908 skb_reset_mac_header(skb2); 1909 1910 if (skb_network_header(skb2) < skb2->data || 1911 skb_network_header(skb2) > skb_tail_pointer(skb2)) { 1912 net_crit_ratelimited("protocol %04x is buggy, dev %s\n", 1913 ntohs(skb2->protocol), 1914 dev->name); 1915 skb_reset_network_header(skb2); 1916 } 1917 1918 skb2->transport_header = skb2->network_header; 1919 skb2->pkt_type = PACKET_OUTGOING; 1920 pt_prev = ptype; 1921 } 1922 1923 if (ptype_list == &ptype_all) { 1924 ptype_list = &dev->ptype_all; 1925 goto again; 1926 } 1927 out_unlock: 1928 if (pt_prev) 1929 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); 1930 rcu_read_unlock(); 1931 } 1932 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); 1933 1934 /** 1935 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change 1936 * @dev: Network device 1937 * @txq: number of queues available 1938 * 1939 * If real_num_tx_queues is changed the tc mappings may no longer be 1940 * valid. To resolve this verify the tc mapping remains valid and if 1941 * not NULL the mapping. With no priorities mapping to this 1942 * offset/count pair it will no longer be used. In the worst case TC0 1943 * is invalid nothing can be done so disable priority mappings. If is 1944 * expected that drivers will fix this mapping if they can before 1945 * calling netif_set_real_num_tx_queues. 1946 */ 1947 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1948 { 1949 int i; 1950 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1951 1952 /* If TC0 is invalidated disable TC mapping */ 1953 if (tc->offset + tc->count > txq) { 1954 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1955 dev->num_tc = 0; 1956 return; 1957 } 1958 1959 /* Invalidated prio to tc mappings set to TC0 */ 1960 for (i = 1; i < TC_BITMASK + 1; i++) { 1961 int q = netdev_get_prio_tc_map(dev, i); 1962 1963 tc = &dev->tc_to_txq[q]; 1964 if (tc->offset + tc->count > txq) { 1965 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", 1966 i, q); 1967 netdev_set_prio_tc_map(dev, i, 0); 1968 } 1969 } 1970 } 1971 1972 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) 1973 { 1974 if (dev->num_tc) { 1975 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1976 int i; 1977 1978 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { 1979 if ((txq - tc->offset) < tc->count) 1980 return i; 1981 } 1982 1983 return -1; 1984 } 1985 1986 return 0; 1987 } 1988 1989 #ifdef CONFIG_XPS 1990 static DEFINE_MUTEX(xps_map_mutex); 1991 #define xmap_dereference(P) \ 1992 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1993 1994 static bool remove_xps_queue(struct xps_dev_maps *dev_maps, 1995 int tci, u16 index) 1996 { 1997 struct xps_map *map = NULL; 1998 int pos; 1999 2000 if (dev_maps) 2001 map = xmap_dereference(dev_maps->cpu_map[tci]); 2002 if (!map) 2003 return false; 2004 2005 for (pos = map->len; pos--;) { 2006 if (map->queues[pos] != index) 2007 continue; 2008 2009 if (map->len > 1) { 2010 map->queues[pos] = map->queues[--map->len]; 2011 break; 2012 } 2013 2014 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); 2015 kfree_rcu(map, rcu); 2016 return false; 2017 } 2018 2019 return true; 2020 } 2021 2022 static bool remove_xps_queue_cpu(struct net_device *dev, 2023 struct xps_dev_maps *dev_maps, 2024 int cpu, u16 offset, u16 count) 2025 { 2026 int num_tc = dev->num_tc ? : 1; 2027 bool active = false; 2028 int tci; 2029 2030 for (tci = cpu * num_tc; num_tc--; tci++) { 2031 int i, j; 2032 2033 for (i = count, j = offset; i--; j++) { 2034 if (!remove_xps_queue(dev_maps, cpu, j)) 2035 break; 2036 } 2037 2038 active |= i < 0; 2039 } 2040 2041 return active; 2042 } 2043 2044 static void netif_reset_xps_queues(struct net_device *dev, u16 offset, 2045 u16 count) 2046 { 2047 struct xps_dev_maps *dev_maps; 2048 int cpu, i; 2049 bool active = false; 2050 2051 mutex_lock(&xps_map_mutex); 2052 dev_maps = xmap_dereference(dev->xps_maps); 2053 2054 if (!dev_maps) 2055 goto out_no_maps; 2056 2057 for_each_possible_cpu(cpu) 2058 active |= remove_xps_queue_cpu(dev, dev_maps, cpu, 2059 offset, count); 2060 2061 if (!active) { 2062 RCU_INIT_POINTER(dev->xps_maps, NULL); 2063 kfree_rcu(dev_maps, rcu); 2064 } 2065 2066 for (i = offset + (count - 1); count--; i--) 2067 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 2068 NUMA_NO_NODE); 2069 2070 out_no_maps: 2071 mutex_unlock(&xps_map_mutex); 2072 } 2073 2074 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 2075 { 2076 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); 2077 } 2078 2079 static struct xps_map *expand_xps_map(struct xps_map *map, 2080 int cpu, u16 index) 2081 { 2082 struct xps_map *new_map; 2083 int alloc_len = XPS_MIN_MAP_ALLOC; 2084 int i, pos; 2085 2086 for (pos = 0; map && pos < map->len; pos++) { 2087 if (map->queues[pos] != index) 2088 continue; 2089 return map; 2090 } 2091 2092 /* Need to add queue to this CPU's existing map */ 2093 if (map) { 2094 if (pos < map->alloc_len) 2095 return map; 2096 2097 alloc_len = map->alloc_len * 2; 2098 } 2099 2100 /* Need to allocate new map to store queue on this CPU's map */ 2101 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 2102 cpu_to_node(cpu)); 2103 if (!new_map) 2104 return NULL; 2105 2106 for (i = 0; i < pos; i++) 2107 new_map->queues[i] = map->queues[i]; 2108 new_map->alloc_len = alloc_len; 2109 new_map->len = pos; 2110 2111 return new_map; 2112 } 2113 2114 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, 2115 u16 index) 2116 { 2117 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 2118 int i, cpu, tci, numa_node_id = -2; 2119 int maps_sz, num_tc = 1, tc = 0; 2120 struct xps_map *map, *new_map; 2121 bool active = false; 2122 2123 if (dev->num_tc) { 2124 num_tc = dev->num_tc; 2125 tc = netdev_txq_to_tc(dev, index); 2126 if (tc < 0) 2127 return -EINVAL; 2128 } 2129 2130 maps_sz = XPS_DEV_MAPS_SIZE(num_tc); 2131 if (maps_sz < L1_CACHE_BYTES) 2132 maps_sz = L1_CACHE_BYTES; 2133 2134 mutex_lock(&xps_map_mutex); 2135 2136 dev_maps = xmap_dereference(dev->xps_maps); 2137 2138 /* allocate memory for queue storage */ 2139 for_each_cpu_and(cpu, cpu_online_mask, mask) { 2140 if (!new_dev_maps) 2141 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2142 if (!new_dev_maps) { 2143 mutex_unlock(&xps_map_mutex); 2144 return -ENOMEM; 2145 } 2146 2147 tci = cpu * num_tc + tc; 2148 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : 2149 NULL; 2150 2151 map = expand_xps_map(map, cpu, index); 2152 if (!map) 2153 goto error; 2154 2155 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2156 } 2157 2158 if (!new_dev_maps) 2159 goto out_no_new_maps; 2160 2161 for_each_possible_cpu(cpu) { 2162 /* copy maps belonging to foreign traffic classes */ 2163 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { 2164 /* fill in the new device map from the old device map */ 2165 map = xmap_dereference(dev_maps->cpu_map[tci]); 2166 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2167 } 2168 2169 /* We need to explicitly update tci as prevous loop 2170 * could break out early if dev_maps is NULL. 2171 */ 2172 tci = cpu * num_tc + tc; 2173 2174 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2175 /* add queue to CPU maps */ 2176 int pos = 0; 2177 2178 map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2179 while ((pos < map->len) && (map->queues[pos] != index)) 2180 pos++; 2181 2182 if (pos == map->len) 2183 map->queues[map->len++] = index; 2184 #ifdef CONFIG_NUMA 2185 if (numa_node_id == -2) 2186 numa_node_id = cpu_to_node(cpu); 2187 else if (numa_node_id != cpu_to_node(cpu)) 2188 numa_node_id = -1; 2189 #endif 2190 } else if (dev_maps) { 2191 /* fill in the new device map from the old device map */ 2192 map = xmap_dereference(dev_maps->cpu_map[tci]); 2193 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2194 } 2195 2196 /* copy maps belonging to foreign traffic classes */ 2197 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { 2198 /* fill in the new device map from the old device map */ 2199 map = xmap_dereference(dev_maps->cpu_map[tci]); 2200 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2201 } 2202 } 2203 2204 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2205 2206 /* Cleanup old maps */ 2207 if (!dev_maps) 2208 goto out_no_old_maps; 2209 2210 for_each_possible_cpu(cpu) { 2211 for (i = num_tc, tci = cpu * num_tc; i--; tci++) { 2212 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2213 map = xmap_dereference(dev_maps->cpu_map[tci]); 2214 if (map && map != new_map) 2215 kfree_rcu(map, rcu); 2216 } 2217 } 2218 2219 kfree_rcu(dev_maps, rcu); 2220 2221 out_no_old_maps: 2222 dev_maps = new_dev_maps; 2223 active = true; 2224 2225 out_no_new_maps: 2226 /* update Tx queue numa node */ 2227 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2228 (numa_node_id >= 0) ? numa_node_id : 2229 NUMA_NO_NODE); 2230 2231 if (!dev_maps) 2232 goto out_no_maps; 2233 2234 /* removes queue from unused CPUs */ 2235 for_each_possible_cpu(cpu) { 2236 for (i = tc, tci = cpu * num_tc; i--; tci++) 2237 active |= remove_xps_queue(dev_maps, tci, index); 2238 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) 2239 active |= remove_xps_queue(dev_maps, tci, index); 2240 for (i = num_tc - tc, tci++; --i; tci++) 2241 active |= remove_xps_queue(dev_maps, tci, index); 2242 } 2243 2244 /* free map if not active */ 2245 if (!active) { 2246 RCU_INIT_POINTER(dev->xps_maps, NULL); 2247 kfree_rcu(dev_maps, rcu); 2248 } 2249 2250 out_no_maps: 2251 mutex_unlock(&xps_map_mutex); 2252 2253 return 0; 2254 error: 2255 /* remove any maps that we added */ 2256 for_each_possible_cpu(cpu) { 2257 for (i = num_tc, tci = cpu * num_tc; i--; tci++) { 2258 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2259 map = dev_maps ? 2260 xmap_dereference(dev_maps->cpu_map[tci]) : 2261 NULL; 2262 if (new_map && new_map != map) 2263 kfree(new_map); 2264 } 2265 } 2266 2267 mutex_unlock(&xps_map_mutex); 2268 2269 kfree(new_dev_maps); 2270 return -ENOMEM; 2271 } 2272 EXPORT_SYMBOL(netif_set_xps_queue); 2273 2274 #endif 2275 void netdev_reset_tc(struct net_device *dev) 2276 { 2277 #ifdef CONFIG_XPS 2278 netif_reset_xps_queues_gt(dev, 0); 2279 #endif 2280 dev->num_tc = 0; 2281 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); 2282 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); 2283 } 2284 EXPORT_SYMBOL(netdev_reset_tc); 2285 2286 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) 2287 { 2288 if (tc >= dev->num_tc) 2289 return -EINVAL; 2290 2291 #ifdef CONFIG_XPS 2292 netif_reset_xps_queues(dev, offset, count); 2293 #endif 2294 dev->tc_to_txq[tc].count = count; 2295 dev->tc_to_txq[tc].offset = offset; 2296 return 0; 2297 } 2298 EXPORT_SYMBOL(netdev_set_tc_queue); 2299 2300 int netdev_set_num_tc(struct net_device *dev, u8 num_tc) 2301 { 2302 if (num_tc > TC_MAX_QUEUE) 2303 return -EINVAL; 2304 2305 #ifdef CONFIG_XPS 2306 netif_reset_xps_queues_gt(dev, 0); 2307 #endif 2308 dev->num_tc = num_tc; 2309 return 0; 2310 } 2311 EXPORT_SYMBOL(netdev_set_num_tc); 2312 2313 /* 2314 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2315 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 2316 */ 2317 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2318 { 2319 int rc; 2320 2321 if (txq < 1 || txq > dev->num_tx_queues) 2322 return -EINVAL; 2323 2324 if (dev->reg_state == NETREG_REGISTERED || 2325 dev->reg_state == NETREG_UNREGISTERING) { 2326 ASSERT_RTNL(); 2327 2328 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2329 txq); 2330 if (rc) 2331 return rc; 2332 2333 if (dev->num_tc) 2334 netif_setup_tc(dev, txq); 2335 2336 if (txq < dev->real_num_tx_queues) { 2337 qdisc_reset_all_tx_gt(dev, txq); 2338 #ifdef CONFIG_XPS 2339 netif_reset_xps_queues_gt(dev, txq); 2340 #endif 2341 } 2342 } 2343 2344 dev->real_num_tx_queues = txq; 2345 return 0; 2346 } 2347 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2348 2349 #ifdef CONFIG_SYSFS 2350 /** 2351 * netif_set_real_num_rx_queues - set actual number of RX queues used 2352 * @dev: Network device 2353 * @rxq: Actual number of RX queues 2354 * 2355 * This must be called either with the rtnl_lock held or before 2356 * registration of the net device. Returns 0 on success, or a 2357 * negative error code. If called before registration, it always 2358 * succeeds. 2359 */ 2360 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2361 { 2362 int rc; 2363 2364 if (rxq < 1 || rxq > dev->num_rx_queues) 2365 return -EINVAL; 2366 2367 if (dev->reg_state == NETREG_REGISTERED) { 2368 ASSERT_RTNL(); 2369 2370 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2371 rxq); 2372 if (rc) 2373 return rc; 2374 } 2375 2376 dev->real_num_rx_queues = rxq; 2377 return 0; 2378 } 2379 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2380 #endif 2381 2382 /** 2383 * netif_get_num_default_rss_queues - default number of RSS queues 2384 * 2385 * This routine should set an upper limit on the number of RSS queues 2386 * used by default by multiqueue devices. 2387 */ 2388 int netif_get_num_default_rss_queues(void) 2389 { 2390 return is_kdump_kernel() ? 2391 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2392 } 2393 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2394 2395 static void __netif_reschedule(struct Qdisc *q) 2396 { 2397 struct softnet_data *sd; 2398 unsigned long flags; 2399 2400 local_irq_save(flags); 2401 sd = this_cpu_ptr(&softnet_data); 2402 q->next_sched = NULL; 2403 *sd->output_queue_tailp = q; 2404 sd->output_queue_tailp = &q->next_sched; 2405 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2406 local_irq_restore(flags); 2407 } 2408 2409 void __netif_schedule(struct Qdisc *q) 2410 { 2411 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2412 __netif_reschedule(q); 2413 } 2414 EXPORT_SYMBOL(__netif_schedule); 2415 2416 struct dev_kfree_skb_cb { 2417 enum skb_free_reason reason; 2418 }; 2419 2420 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2421 { 2422 return (struct dev_kfree_skb_cb *)skb->cb; 2423 } 2424 2425 void netif_schedule_queue(struct netdev_queue *txq) 2426 { 2427 rcu_read_lock(); 2428 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2429 struct Qdisc *q = rcu_dereference(txq->qdisc); 2430 2431 __netif_schedule(q); 2432 } 2433 rcu_read_unlock(); 2434 } 2435 EXPORT_SYMBOL(netif_schedule_queue); 2436 2437 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2438 { 2439 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2440 struct Qdisc *q; 2441 2442 rcu_read_lock(); 2443 q = rcu_dereference(dev_queue->qdisc); 2444 __netif_schedule(q); 2445 rcu_read_unlock(); 2446 } 2447 } 2448 EXPORT_SYMBOL(netif_tx_wake_queue); 2449 2450 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2451 { 2452 unsigned long flags; 2453 2454 if (unlikely(!skb)) 2455 return; 2456 2457 if (likely(atomic_read(&skb->users) == 1)) { 2458 smp_rmb(); 2459 atomic_set(&skb->users, 0); 2460 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2461 return; 2462 } 2463 get_kfree_skb_cb(skb)->reason = reason; 2464 local_irq_save(flags); 2465 skb->next = __this_cpu_read(softnet_data.completion_queue); 2466 __this_cpu_write(softnet_data.completion_queue, skb); 2467 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2468 local_irq_restore(flags); 2469 } 2470 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2471 2472 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2473 { 2474 if (in_irq() || irqs_disabled()) 2475 __dev_kfree_skb_irq(skb, reason); 2476 else 2477 dev_kfree_skb(skb); 2478 } 2479 EXPORT_SYMBOL(__dev_kfree_skb_any); 2480 2481 2482 /** 2483 * netif_device_detach - mark device as removed 2484 * @dev: network device 2485 * 2486 * Mark device as removed from system and therefore no longer available. 2487 */ 2488 void netif_device_detach(struct net_device *dev) 2489 { 2490 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2491 netif_running(dev)) { 2492 netif_tx_stop_all_queues(dev); 2493 } 2494 } 2495 EXPORT_SYMBOL(netif_device_detach); 2496 2497 /** 2498 * netif_device_attach - mark device as attached 2499 * @dev: network device 2500 * 2501 * Mark device as attached from system and restart if needed. 2502 */ 2503 void netif_device_attach(struct net_device *dev) 2504 { 2505 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2506 netif_running(dev)) { 2507 netif_tx_wake_all_queues(dev); 2508 __netdev_watchdog_up(dev); 2509 } 2510 } 2511 EXPORT_SYMBOL(netif_device_attach); 2512 2513 /* 2514 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 2515 * to be used as a distribution range. 2516 */ 2517 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2518 unsigned int num_tx_queues) 2519 { 2520 u32 hash; 2521 u16 qoffset = 0; 2522 u16 qcount = num_tx_queues; 2523 2524 if (skb_rx_queue_recorded(skb)) { 2525 hash = skb_get_rx_queue(skb); 2526 while (unlikely(hash >= num_tx_queues)) 2527 hash -= num_tx_queues; 2528 return hash; 2529 } 2530 2531 if (dev->num_tc) { 2532 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2533 2534 qoffset = dev->tc_to_txq[tc].offset; 2535 qcount = dev->tc_to_txq[tc].count; 2536 } 2537 2538 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2539 } 2540 EXPORT_SYMBOL(__skb_tx_hash); 2541 2542 static void skb_warn_bad_offload(const struct sk_buff *skb) 2543 { 2544 static const netdev_features_t null_features; 2545 struct net_device *dev = skb->dev; 2546 const char *name = ""; 2547 2548 if (!net_ratelimit()) 2549 return; 2550 2551 if (dev) { 2552 if (dev->dev.parent) 2553 name = dev_driver_string(dev->dev.parent); 2554 else 2555 name = netdev_name(dev); 2556 } 2557 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2558 "gso_type=%d ip_summed=%d\n", 2559 name, dev ? &dev->features : &null_features, 2560 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2561 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2562 skb_shinfo(skb)->gso_type, skb->ip_summed); 2563 } 2564 2565 /* 2566 * Invalidate hardware checksum when packet is to be mangled, and 2567 * complete checksum manually on outgoing path. 2568 */ 2569 int skb_checksum_help(struct sk_buff *skb) 2570 { 2571 __wsum csum; 2572 int ret = 0, offset; 2573 2574 if (skb->ip_summed == CHECKSUM_COMPLETE) 2575 goto out_set_summed; 2576 2577 if (unlikely(skb_shinfo(skb)->gso_size)) { 2578 skb_warn_bad_offload(skb); 2579 return -EINVAL; 2580 } 2581 2582 /* Before computing a checksum, we should make sure no frag could 2583 * be modified by an external entity : checksum could be wrong. 2584 */ 2585 if (skb_has_shared_frag(skb)) { 2586 ret = __skb_linearize(skb); 2587 if (ret) 2588 goto out; 2589 } 2590 2591 offset = skb_checksum_start_offset(skb); 2592 BUG_ON(offset >= skb_headlen(skb)); 2593 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2594 2595 offset += skb->csum_offset; 2596 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2597 2598 if (skb_cloned(skb) && 2599 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2600 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2601 if (ret) 2602 goto out; 2603 } 2604 2605 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; 2606 out_set_summed: 2607 skb->ip_summed = CHECKSUM_NONE; 2608 out: 2609 return ret; 2610 } 2611 EXPORT_SYMBOL(skb_checksum_help); 2612 2613 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2614 { 2615 __be16 type = skb->protocol; 2616 2617 /* Tunnel gso handlers can set protocol to ethernet. */ 2618 if (type == htons(ETH_P_TEB)) { 2619 struct ethhdr *eth; 2620 2621 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2622 return 0; 2623 2624 eth = (struct ethhdr *)skb_mac_header(skb); 2625 type = eth->h_proto; 2626 } 2627 2628 return __vlan_get_protocol(skb, type, depth); 2629 } 2630 2631 /** 2632 * skb_mac_gso_segment - mac layer segmentation handler. 2633 * @skb: buffer to segment 2634 * @features: features for the output path (see dev->features) 2635 */ 2636 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2637 netdev_features_t features) 2638 { 2639 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2640 struct packet_offload *ptype; 2641 int vlan_depth = skb->mac_len; 2642 __be16 type = skb_network_protocol(skb, &vlan_depth); 2643 2644 if (unlikely(!type)) 2645 return ERR_PTR(-EINVAL); 2646 2647 __skb_pull(skb, vlan_depth); 2648 2649 rcu_read_lock(); 2650 list_for_each_entry_rcu(ptype, &offload_base, list) { 2651 if (ptype->type == type && ptype->callbacks.gso_segment) { 2652 segs = ptype->callbacks.gso_segment(skb, features); 2653 break; 2654 } 2655 } 2656 rcu_read_unlock(); 2657 2658 __skb_push(skb, skb->data - skb_mac_header(skb)); 2659 2660 return segs; 2661 } 2662 EXPORT_SYMBOL(skb_mac_gso_segment); 2663 2664 2665 /* openvswitch calls this on rx path, so we need a different check. 2666 */ 2667 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2668 { 2669 if (tx_path) 2670 return skb->ip_summed != CHECKSUM_PARTIAL && 2671 skb->ip_summed != CHECKSUM_NONE; 2672 2673 return skb->ip_summed == CHECKSUM_NONE; 2674 } 2675 2676 /** 2677 * __skb_gso_segment - Perform segmentation on skb. 2678 * @skb: buffer to segment 2679 * @features: features for the output path (see dev->features) 2680 * @tx_path: whether it is called in TX path 2681 * 2682 * This function segments the given skb and returns a list of segments. 2683 * 2684 * It may return NULL if the skb requires no segmentation. This is 2685 * only possible when GSO is used for verifying header integrity. 2686 * 2687 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. 2688 */ 2689 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2690 netdev_features_t features, bool tx_path) 2691 { 2692 struct sk_buff *segs; 2693 2694 if (unlikely(skb_needs_check(skb, tx_path))) { 2695 int err; 2696 2697 /* We're going to init ->check field in TCP or UDP header */ 2698 err = skb_cow_head(skb, 0); 2699 if (err < 0) 2700 return ERR_PTR(err); 2701 } 2702 2703 /* Only report GSO partial support if it will enable us to 2704 * support segmentation on this frame without needing additional 2705 * work. 2706 */ 2707 if (features & NETIF_F_GSO_PARTIAL) { 2708 netdev_features_t partial_features = NETIF_F_GSO_ROBUST; 2709 struct net_device *dev = skb->dev; 2710 2711 partial_features |= dev->features & dev->gso_partial_features; 2712 if (!skb_gso_ok(skb, features | partial_features)) 2713 features &= ~NETIF_F_GSO_PARTIAL; 2714 } 2715 2716 BUILD_BUG_ON(SKB_SGO_CB_OFFSET + 2717 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); 2718 2719 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2720 SKB_GSO_CB(skb)->encap_level = 0; 2721 2722 skb_reset_mac_header(skb); 2723 skb_reset_mac_len(skb); 2724 2725 segs = skb_mac_gso_segment(skb, features); 2726 2727 if (unlikely(skb_needs_check(skb, tx_path))) 2728 skb_warn_bad_offload(skb); 2729 2730 return segs; 2731 } 2732 EXPORT_SYMBOL(__skb_gso_segment); 2733 2734 /* Take action when hardware reception checksum errors are detected. */ 2735 #ifdef CONFIG_BUG 2736 void netdev_rx_csum_fault(struct net_device *dev) 2737 { 2738 if (net_ratelimit()) { 2739 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2740 dump_stack(); 2741 } 2742 } 2743 EXPORT_SYMBOL(netdev_rx_csum_fault); 2744 #endif 2745 2746 /* Actually, we should eliminate this check as soon as we know, that: 2747 * 1. IOMMU is present and allows to map all the memory. 2748 * 2. No high memory really exists on this machine. 2749 */ 2750 2751 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2752 { 2753 #ifdef CONFIG_HIGHMEM 2754 int i; 2755 2756 if (!(dev->features & NETIF_F_HIGHDMA)) { 2757 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2758 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2759 2760 if (PageHighMem(skb_frag_page(frag))) 2761 return 1; 2762 } 2763 } 2764 2765 if (PCI_DMA_BUS_IS_PHYS) { 2766 struct device *pdev = dev->dev.parent; 2767 2768 if (!pdev) 2769 return 0; 2770 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2771 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2772 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2773 2774 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2775 return 1; 2776 } 2777 } 2778 #endif 2779 return 0; 2780 } 2781 2782 /* If MPLS offload request, verify we are testing hardware MPLS features 2783 * instead of standard features for the netdev. 2784 */ 2785 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2786 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2787 netdev_features_t features, 2788 __be16 type) 2789 { 2790 if (eth_p_mpls(type)) 2791 features &= skb->dev->mpls_features; 2792 2793 return features; 2794 } 2795 #else 2796 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2797 netdev_features_t features, 2798 __be16 type) 2799 { 2800 return features; 2801 } 2802 #endif 2803 2804 static netdev_features_t harmonize_features(struct sk_buff *skb, 2805 netdev_features_t features) 2806 { 2807 int tmp; 2808 __be16 type; 2809 2810 type = skb_network_protocol(skb, &tmp); 2811 features = net_mpls_features(skb, features, type); 2812 2813 if (skb->ip_summed != CHECKSUM_NONE && 2814 !can_checksum_protocol(features, type)) { 2815 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); 2816 } 2817 if (illegal_highdma(skb->dev, skb)) 2818 features &= ~NETIF_F_SG; 2819 2820 return features; 2821 } 2822 2823 netdev_features_t passthru_features_check(struct sk_buff *skb, 2824 struct net_device *dev, 2825 netdev_features_t features) 2826 { 2827 return features; 2828 } 2829 EXPORT_SYMBOL(passthru_features_check); 2830 2831 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2832 struct net_device *dev, 2833 netdev_features_t features) 2834 { 2835 return vlan_features_check(skb, features); 2836 } 2837 2838 static netdev_features_t gso_features_check(const struct sk_buff *skb, 2839 struct net_device *dev, 2840 netdev_features_t features) 2841 { 2842 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2843 2844 if (gso_segs > dev->gso_max_segs) 2845 return features & ~NETIF_F_GSO_MASK; 2846 2847 /* Support for GSO partial features requires software 2848 * intervention before we can actually process the packets 2849 * so we need to strip support for any partial features now 2850 * and we can pull them back in after we have partially 2851 * segmented the frame. 2852 */ 2853 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) 2854 features &= ~dev->gso_partial_features; 2855 2856 /* Make sure to clear the IPv4 ID mangling feature if the 2857 * IPv4 header has the potential to be fragmented. 2858 */ 2859 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2860 struct iphdr *iph = skb->encapsulation ? 2861 inner_ip_hdr(skb) : ip_hdr(skb); 2862 2863 if (!(iph->frag_off & htons(IP_DF))) 2864 features &= ~NETIF_F_TSO_MANGLEID; 2865 } 2866 2867 return features; 2868 } 2869 2870 netdev_features_t netif_skb_features(struct sk_buff *skb) 2871 { 2872 struct net_device *dev = skb->dev; 2873 netdev_features_t features = dev->features; 2874 2875 if (skb_is_gso(skb)) 2876 features = gso_features_check(skb, dev, features); 2877 2878 /* If encapsulation offload request, verify we are testing 2879 * hardware encapsulation features instead of standard 2880 * features for the netdev 2881 */ 2882 if (skb->encapsulation) 2883 features &= dev->hw_enc_features; 2884 2885 if (skb_vlan_tagged(skb)) 2886 features = netdev_intersect_features(features, 2887 dev->vlan_features | 2888 NETIF_F_HW_VLAN_CTAG_TX | 2889 NETIF_F_HW_VLAN_STAG_TX); 2890 2891 if (dev->netdev_ops->ndo_features_check) 2892 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2893 features); 2894 else 2895 features &= dflt_features_check(skb, dev, features); 2896 2897 return harmonize_features(skb, features); 2898 } 2899 EXPORT_SYMBOL(netif_skb_features); 2900 2901 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2902 struct netdev_queue *txq, bool more) 2903 { 2904 unsigned int len; 2905 int rc; 2906 2907 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2908 dev_queue_xmit_nit(skb, dev); 2909 2910 len = skb->len; 2911 trace_net_dev_start_xmit(skb, dev); 2912 rc = netdev_start_xmit(skb, dev, txq, more); 2913 trace_net_dev_xmit(skb, rc, dev, len); 2914 2915 return rc; 2916 } 2917 2918 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2919 struct netdev_queue *txq, int *ret) 2920 { 2921 struct sk_buff *skb = first; 2922 int rc = NETDEV_TX_OK; 2923 2924 while (skb) { 2925 struct sk_buff *next = skb->next; 2926 2927 skb->next = NULL; 2928 rc = xmit_one(skb, dev, txq, next != NULL); 2929 if (unlikely(!dev_xmit_complete(rc))) { 2930 skb->next = next; 2931 goto out; 2932 } 2933 2934 skb = next; 2935 if (netif_xmit_stopped(txq) && skb) { 2936 rc = NETDEV_TX_BUSY; 2937 break; 2938 } 2939 } 2940 2941 out: 2942 *ret = rc; 2943 return skb; 2944 } 2945 2946 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2947 netdev_features_t features) 2948 { 2949 if (skb_vlan_tag_present(skb) && 2950 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2951 skb = __vlan_hwaccel_push_inside(skb); 2952 return skb; 2953 } 2954 2955 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2956 { 2957 netdev_features_t features; 2958 2959 features = netif_skb_features(skb); 2960 skb = validate_xmit_vlan(skb, features); 2961 if (unlikely(!skb)) 2962 goto out_null; 2963 2964 if (netif_needs_gso(skb, features)) { 2965 struct sk_buff *segs; 2966 2967 segs = skb_gso_segment(skb, features); 2968 if (IS_ERR(segs)) { 2969 goto out_kfree_skb; 2970 } else if (segs) { 2971 consume_skb(skb); 2972 skb = segs; 2973 } 2974 } else { 2975 if (skb_needs_linearize(skb, features) && 2976 __skb_linearize(skb)) 2977 goto out_kfree_skb; 2978 2979 if (validate_xmit_xfrm(skb, features)) 2980 goto out_kfree_skb; 2981 2982 /* If packet is not checksummed and device does not 2983 * support checksumming for this protocol, complete 2984 * checksumming here. 2985 */ 2986 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2987 if (skb->encapsulation) 2988 skb_set_inner_transport_header(skb, 2989 skb_checksum_start_offset(skb)); 2990 else 2991 skb_set_transport_header(skb, 2992 skb_checksum_start_offset(skb)); 2993 if (!(features & NETIF_F_CSUM_MASK) && 2994 skb_checksum_help(skb)) 2995 goto out_kfree_skb; 2996 } 2997 } 2998 2999 return skb; 3000 3001 out_kfree_skb: 3002 kfree_skb(skb); 3003 out_null: 3004 atomic_long_inc(&dev->tx_dropped); 3005 return NULL; 3006 } 3007 3008 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 3009 { 3010 struct sk_buff *next, *head = NULL, *tail; 3011 3012 for (; skb != NULL; skb = next) { 3013 next = skb->next; 3014 skb->next = NULL; 3015 3016 /* in case skb wont be segmented, point to itself */ 3017 skb->prev = skb; 3018 3019 skb = validate_xmit_skb(skb, dev); 3020 if (!skb) 3021 continue; 3022 3023 if (!head) 3024 head = skb; 3025 else 3026 tail->next = skb; 3027 /* If skb was segmented, skb->prev points to 3028 * the last segment. If not, it still contains skb. 3029 */ 3030 tail = skb->prev; 3031 } 3032 return head; 3033 } 3034 EXPORT_SYMBOL_GPL(validate_xmit_skb_list); 3035 3036 static void qdisc_pkt_len_init(struct sk_buff *skb) 3037 { 3038 const struct skb_shared_info *shinfo = skb_shinfo(skb); 3039 3040 qdisc_skb_cb(skb)->pkt_len = skb->len; 3041 3042 /* To get more precise estimation of bytes sent on wire, 3043 * we add to pkt_len the headers size of all segments 3044 */ 3045 if (shinfo->gso_size) { 3046 unsigned int hdr_len; 3047 u16 gso_segs = shinfo->gso_segs; 3048 3049 /* mac layer + network layer */ 3050 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 3051 3052 /* + transport layer */ 3053 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 3054 hdr_len += tcp_hdrlen(skb); 3055 else 3056 hdr_len += sizeof(struct udphdr); 3057 3058 if (shinfo->gso_type & SKB_GSO_DODGY) 3059 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 3060 shinfo->gso_size); 3061 3062 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 3063 } 3064 } 3065 3066 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 3067 struct net_device *dev, 3068 struct netdev_queue *txq) 3069 { 3070 spinlock_t *root_lock = qdisc_lock(q); 3071 struct sk_buff *to_free = NULL; 3072 bool contended; 3073 int rc; 3074 3075 qdisc_calculate_pkt_len(skb, q); 3076 /* 3077 * Heuristic to force contended enqueues to serialize on a 3078 * separate lock before trying to get qdisc main lock. 3079 * This permits qdisc->running owner to get the lock more 3080 * often and dequeue packets faster. 3081 */ 3082 contended = qdisc_is_running(q); 3083 if (unlikely(contended)) 3084 spin_lock(&q->busylock); 3085 3086 spin_lock(root_lock); 3087 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 3088 __qdisc_drop(skb, &to_free); 3089 rc = NET_XMIT_DROP; 3090 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 3091 qdisc_run_begin(q)) { 3092 /* 3093 * This is a work-conserving queue; there are no old skbs 3094 * waiting to be sent out; and the qdisc is not running - 3095 * xmit the skb directly. 3096 */ 3097 3098 qdisc_bstats_update(q, skb); 3099 3100 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 3101 if (unlikely(contended)) { 3102 spin_unlock(&q->busylock); 3103 contended = false; 3104 } 3105 __qdisc_run(q); 3106 } else 3107 qdisc_run_end(q); 3108 3109 rc = NET_XMIT_SUCCESS; 3110 } else { 3111 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; 3112 if (qdisc_run_begin(q)) { 3113 if (unlikely(contended)) { 3114 spin_unlock(&q->busylock); 3115 contended = false; 3116 } 3117 __qdisc_run(q); 3118 } 3119 } 3120 spin_unlock(root_lock); 3121 if (unlikely(to_free)) 3122 kfree_skb_list(to_free); 3123 if (unlikely(contended)) 3124 spin_unlock(&q->busylock); 3125 return rc; 3126 } 3127 3128 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 3129 static void skb_update_prio(struct sk_buff *skb) 3130 { 3131 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 3132 3133 if (!skb->priority && skb->sk && map) { 3134 unsigned int prioidx = 3135 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); 3136 3137 if (prioidx < map->priomap_len) 3138 skb->priority = map->priomap[prioidx]; 3139 } 3140 } 3141 #else 3142 #define skb_update_prio(skb) 3143 #endif 3144 3145 DEFINE_PER_CPU(int, xmit_recursion); 3146 EXPORT_SYMBOL(xmit_recursion); 3147 3148 /** 3149 * dev_loopback_xmit - loop back @skb 3150 * @net: network namespace this loopback is happening in 3151 * @sk: sk needed to be a netfilter okfn 3152 * @skb: buffer to transmit 3153 */ 3154 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 3155 { 3156 skb_reset_mac_header(skb); 3157 __skb_pull(skb, skb_network_offset(skb)); 3158 skb->pkt_type = PACKET_LOOPBACK; 3159 skb->ip_summed = CHECKSUM_UNNECESSARY; 3160 WARN_ON(!skb_dst(skb)); 3161 skb_dst_force(skb); 3162 netif_rx_ni(skb); 3163 return 0; 3164 } 3165 EXPORT_SYMBOL(dev_loopback_xmit); 3166 3167 #ifdef CONFIG_NET_EGRESS 3168 static struct sk_buff * 3169 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) 3170 { 3171 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); 3172 struct tcf_result cl_res; 3173 3174 if (!cl) 3175 return skb; 3176 3177 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ 3178 qdisc_bstats_cpu_update(cl->q, skb); 3179 3180 switch (tc_classify(skb, cl, &cl_res, false)) { 3181 case TC_ACT_OK: 3182 case TC_ACT_RECLASSIFY: 3183 skb->tc_index = TC_H_MIN(cl_res.classid); 3184 break; 3185 case TC_ACT_SHOT: 3186 qdisc_qstats_cpu_drop(cl->q); 3187 *ret = NET_XMIT_DROP; 3188 kfree_skb(skb); 3189 return NULL; 3190 case TC_ACT_STOLEN: 3191 case TC_ACT_QUEUED: 3192 *ret = NET_XMIT_SUCCESS; 3193 consume_skb(skb); 3194 return NULL; 3195 case TC_ACT_REDIRECT: 3196 /* No need to push/pop skb's mac_header here on egress! */ 3197 skb_do_redirect(skb); 3198 *ret = NET_XMIT_SUCCESS; 3199 return NULL; 3200 default: 3201 break; 3202 } 3203 3204 return skb; 3205 } 3206 #endif /* CONFIG_NET_EGRESS */ 3207 3208 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 3209 { 3210 #ifdef CONFIG_XPS 3211 struct xps_dev_maps *dev_maps; 3212 struct xps_map *map; 3213 int queue_index = -1; 3214 3215 rcu_read_lock(); 3216 dev_maps = rcu_dereference(dev->xps_maps); 3217 if (dev_maps) { 3218 unsigned int tci = skb->sender_cpu - 1; 3219 3220 if (dev->num_tc) { 3221 tci *= dev->num_tc; 3222 tci += netdev_get_prio_tc_map(dev, skb->priority); 3223 } 3224 3225 map = rcu_dereference(dev_maps->cpu_map[tci]); 3226 if (map) { 3227 if (map->len == 1) 3228 queue_index = map->queues[0]; 3229 else 3230 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 3231 map->len)]; 3232 if (unlikely(queue_index >= dev->real_num_tx_queues)) 3233 queue_index = -1; 3234 } 3235 } 3236 rcu_read_unlock(); 3237 3238 return queue_index; 3239 #else 3240 return -1; 3241 #endif 3242 } 3243 3244 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 3245 { 3246 struct sock *sk = skb->sk; 3247 int queue_index = sk_tx_queue_get(sk); 3248 3249 if (queue_index < 0 || skb->ooo_okay || 3250 queue_index >= dev->real_num_tx_queues) { 3251 int new_index = get_xps_queue(dev, skb); 3252 3253 if (new_index < 0) 3254 new_index = skb_tx_hash(dev, skb); 3255 3256 if (queue_index != new_index && sk && 3257 sk_fullsock(sk) && 3258 rcu_access_pointer(sk->sk_dst_cache)) 3259 sk_tx_queue_set(sk, new_index); 3260 3261 queue_index = new_index; 3262 } 3263 3264 return queue_index; 3265 } 3266 3267 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 3268 struct sk_buff *skb, 3269 void *accel_priv) 3270 { 3271 int queue_index = 0; 3272 3273 #ifdef CONFIG_XPS 3274 u32 sender_cpu = skb->sender_cpu - 1; 3275 3276 if (sender_cpu >= (u32)NR_CPUS) 3277 skb->sender_cpu = raw_smp_processor_id() + 1; 3278 #endif 3279 3280 if (dev->real_num_tx_queues != 1) { 3281 const struct net_device_ops *ops = dev->netdev_ops; 3282 3283 if (ops->ndo_select_queue) 3284 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3285 __netdev_pick_tx); 3286 else 3287 queue_index = __netdev_pick_tx(dev, skb); 3288 3289 if (!accel_priv) 3290 queue_index = netdev_cap_txqueue(dev, queue_index); 3291 } 3292 3293 skb_set_queue_mapping(skb, queue_index); 3294 return netdev_get_tx_queue(dev, queue_index); 3295 } 3296 3297 /** 3298 * __dev_queue_xmit - transmit a buffer 3299 * @skb: buffer to transmit 3300 * @accel_priv: private data used for L2 forwarding offload 3301 * 3302 * Queue a buffer for transmission to a network device. The caller must 3303 * have set the device and priority and built the buffer before calling 3304 * this function. The function can be called from an interrupt. 3305 * 3306 * A negative errno code is returned on a failure. A success does not 3307 * guarantee the frame will be transmitted as it may be dropped due 3308 * to congestion or traffic shaping. 3309 * 3310 * ----------------------------------------------------------------------------------- 3311 * I notice this method can also return errors from the queue disciplines, 3312 * including NET_XMIT_DROP, which is a positive value. So, errors can also 3313 * be positive. 3314 * 3315 * Regardless of the return value, the skb is consumed, so it is currently 3316 * difficult to retry a send to this method. (You can bump the ref count 3317 * before sending to hold a reference for retry if you are careful.) 3318 * 3319 * When calling this method, interrupts MUST be enabled. This is because 3320 * the BH enable code must have IRQs enabled so that it will not deadlock. 3321 * --BLG 3322 */ 3323 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 3324 { 3325 struct net_device *dev = skb->dev; 3326 struct netdev_queue *txq; 3327 struct Qdisc *q; 3328 int rc = -ENOMEM; 3329 3330 skb_reset_mac_header(skb); 3331 3332 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 3333 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 3334 3335 /* Disable soft irqs for various locks below. Also 3336 * stops preemption for RCU. 3337 */ 3338 rcu_read_lock_bh(); 3339 3340 skb_update_prio(skb); 3341 3342 qdisc_pkt_len_init(skb); 3343 #ifdef CONFIG_NET_CLS_ACT 3344 skb->tc_at_ingress = 0; 3345 # ifdef CONFIG_NET_EGRESS 3346 if (static_key_false(&egress_needed)) { 3347 skb = sch_handle_egress(skb, &rc, dev); 3348 if (!skb) 3349 goto out; 3350 } 3351 # endif 3352 #endif 3353 /* If device/qdisc don't need skb->dst, release it right now while 3354 * its hot in this cpu cache. 3355 */ 3356 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 3357 skb_dst_drop(skb); 3358 else 3359 skb_dst_force(skb); 3360 3361 txq = netdev_pick_tx(dev, skb, accel_priv); 3362 q = rcu_dereference_bh(txq->qdisc); 3363 3364 trace_net_dev_queue(skb); 3365 if (q->enqueue) { 3366 rc = __dev_xmit_skb(skb, q, dev, txq); 3367 goto out; 3368 } 3369 3370 /* The device has no queue. Common case for software devices: 3371 * loopback, all the sorts of tunnels... 3372 3373 * Really, it is unlikely that netif_tx_lock protection is necessary 3374 * here. (f.e. loopback and IP tunnels are clean ignoring statistics 3375 * counters.) 3376 * However, it is possible, that they rely on protection 3377 * made by us here. 3378 3379 * Check this and shot the lock. It is not prone from deadlocks. 3380 *Either shot noqueue qdisc, it is even simpler 8) 3381 */ 3382 if (dev->flags & IFF_UP) { 3383 int cpu = smp_processor_id(); /* ok because BHs are off */ 3384 3385 if (txq->xmit_lock_owner != cpu) { 3386 if (unlikely(__this_cpu_read(xmit_recursion) > 3387 XMIT_RECURSION_LIMIT)) 3388 goto recursion_alert; 3389 3390 skb = validate_xmit_skb(skb, dev); 3391 if (!skb) 3392 goto out; 3393 3394 HARD_TX_LOCK(dev, txq, cpu); 3395 3396 if (!netif_xmit_stopped(txq)) { 3397 __this_cpu_inc(xmit_recursion); 3398 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 3399 __this_cpu_dec(xmit_recursion); 3400 if (dev_xmit_complete(rc)) { 3401 HARD_TX_UNLOCK(dev, txq); 3402 goto out; 3403 } 3404 } 3405 HARD_TX_UNLOCK(dev, txq); 3406 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 3407 dev->name); 3408 } else { 3409 /* Recursion is detected! It is possible, 3410 * unfortunately 3411 */ 3412 recursion_alert: 3413 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 3414 dev->name); 3415 } 3416 } 3417 3418 rc = -ENETDOWN; 3419 rcu_read_unlock_bh(); 3420 3421 atomic_long_inc(&dev->tx_dropped); 3422 kfree_skb_list(skb); 3423 return rc; 3424 out: 3425 rcu_read_unlock_bh(); 3426 return rc; 3427 } 3428 3429 int dev_queue_xmit(struct sk_buff *skb) 3430 { 3431 return __dev_queue_xmit(skb, NULL); 3432 } 3433 EXPORT_SYMBOL(dev_queue_xmit); 3434 3435 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3436 { 3437 return __dev_queue_xmit(skb, accel_priv); 3438 } 3439 EXPORT_SYMBOL(dev_queue_xmit_accel); 3440 3441 3442 /************************************************************************* 3443 * Receiver routines 3444 *************************************************************************/ 3445 3446 int netdev_max_backlog __read_mostly = 1000; 3447 EXPORT_SYMBOL(netdev_max_backlog); 3448 3449 int netdev_tstamp_prequeue __read_mostly = 1; 3450 int netdev_budget __read_mostly = 300; 3451 unsigned int __read_mostly netdev_budget_usecs = 2000; 3452 int weight_p __read_mostly = 64; /* old backlog weight */ 3453 int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ 3454 int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ 3455 int dev_rx_weight __read_mostly = 64; 3456 int dev_tx_weight __read_mostly = 64; 3457 3458 /* Called with irq disabled */ 3459 static inline void ____napi_schedule(struct softnet_data *sd, 3460 struct napi_struct *napi) 3461 { 3462 list_add_tail(&napi->poll_list, &sd->poll_list); 3463 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3464 } 3465 3466 #ifdef CONFIG_RPS 3467 3468 /* One global table that all flow-based protocols share. */ 3469 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3470 EXPORT_SYMBOL(rps_sock_flow_table); 3471 u32 rps_cpu_mask __read_mostly; 3472 EXPORT_SYMBOL(rps_cpu_mask); 3473 3474 struct static_key rps_needed __read_mostly; 3475 EXPORT_SYMBOL(rps_needed); 3476 struct static_key rfs_needed __read_mostly; 3477 EXPORT_SYMBOL(rfs_needed); 3478 3479 static struct rps_dev_flow * 3480 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3481 struct rps_dev_flow *rflow, u16 next_cpu) 3482 { 3483 if (next_cpu < nr_cpu_ids) { 3484 #ifdef CONFIG_RFS_ACCEL 3485 struct netdev_rx_queue *rxqueue; 3486 struct rps_dev_flow_table *flow_table; 3487 struct rps_dev_flow *old_rflow; 3488 u32 flow_id; 3489 u16 rxq_index; 3490 int rc; 3491 3492 /* Should we steer this flow to a different hardware queue? */ 3493 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3494 !(dev->features & NETIF_F_NTUPLE)) 3495 goto out; 3496 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3497 if (rxq_index == skb_get_rx_queue(skb)) 3498 goto out; 3499 3500 rxqueue = dev->_rx + rxq_index; 3501 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3502 if (!flow_table) 3503 goto out; 3504 flow_id = skb_get_hash(skb) & flow_table->mask; 3505 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3506 rxq_index, flow_id); 3507 if (rc < 0) 3508 goto out; 3509 old_rflow = rflow; 3510 rflow = &flow_table->flows[flow_id]; 3511 rflow->filter = rc; 3512 if (old_rflow->filter == rflow->filter) 3513 old_rflow->filter = RPS_NO_FILTER; 3514 out: 3515 #endif 3516 rflow->last_qtail = 3517 per_cpu(softnet_data, next_cpu).input_queue_head; 3518 } 3519 3520 rflow->cpu = next_cpu; 3521 return rflow; 3522 } 3523 3524 /* 3525 * get_rps_cpu is called from netif_receive_skb and returns the target 3526 * CPU from the RPS map of the receiving queue for a given skb. 3527 * rcu_read_lock must be held on entry. 3528 */ 3529 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3530 struct rps_dev_flow **rflowp) 3531 { 3532 const struct rps_sock_flow_table *sock_flow_table; 3533 struct netdev_rx_queue *rxqueue = dev->_rx; 3534 struct rps_dev_flow_table *flow_table; 3535 struct rps_map *map; 3536 int cpu = -1; 3537 u32 tcpu; 3538 u32 hash; 3539 3540 if (skb_rx_queue_recorded(skb)) { 3541 u16 index = skb_get_rx_queue(skb); 3542 3543 if (unlikely(index >= dev->real_num_rx_queues)) { 3544 WARN_ONCE(dev->real_num_rx_queues > 1, 3545 "%s received packet on queue %u, but number " 3546 "of RX queues is %u\n", 3547 dev->name, index, dev->real_num_rx_queues); 3548 goto done; 3549 } 3550 rxqueue += index; 3551 } 3552 3553 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3554 3555 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3556 map = rcu_dereference(rxqueue->rps_map); 3557 if (!flow_table && !map) 3558 goto done; 3559 3560 skb_reset_network_header(skb); 3561 hash = skb_get_hash(skb); 3562 if (!hash) 3563 goto done; 3564 3565 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3566 if (flow_table && sock_flow_table) { 3567 struct rps_dev_flow *rflow; 3568 u32 next_cpu; 3569 u32 ident; 3570 3571 /* First check into global flow table if there is a match */ 3572 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3573 if ((ident ^ hash) & ~rps_cpu_mask) 3574 goto try_rps; 3575 3576 next_cpu = ident & rps_cpu_mask; 3577 3578 /* OK, now we know there is a match, 3579 * we can look at the local (per receive queue) flow table 3580 */ 3581 rflow = &flow_table->flows[hash & flow_table->mask]; 3582 tcpu = rflow->cpu; 3583 3584 /* 3585 * If the desired CPU (where last recvmsg was done) is 3586 * different from current CPU (one in the rx-queue flow 3587 * table entry), switch if one of the following holds: 3588 * - Current CPU is unset (>= nr_cpu_ids). 3589 * - Current CPU is offline. 3590 * - The current CPU's queue tail has advanced beyond the 3591 * last packet that was enqueued using this table entry. 3592 * This guarantees that all previous packets for the flow 3593 * have been dequeued, thus preserving in order delivery. 3594 */ 3595 if (unlikely(tcpu != next_cpu) && 3596 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || 3597 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3598 rflow->last_qtail)) >= 0)) { 3599 tcpu = next_cpu; 3600 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3601 } 3602 3603 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { 3604 *rflowp = rflow; 3605 cpu = tcpu; 3606 goto done; 3607 } 3608 } 3609 3610 try_rps: 3611 3612 if (map) { 3613 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3614 if (cpu_online(tcpu)) { 3615 cpu = tcpu; 3616 goto done; 3617 } 3618 } 3619 3620 done: 3621 return cpu; 3622 } 3623 3624 #ifdef CONFIG_RFS_ACCEL 3625 3626 /** 3627 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3628 * @dev: Device on which the filter was set 3629 * @rxq_index: RX queue index 3630 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3631 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3632 * 3633 * Drivers that implement ndo_rx_flow_steer() should periodically call 3634 * this function for each installed filter and remove the filters for 3635 * which it returns %true. 3636 */ 3637 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3638 u32 flow_id, u16 filter_id) 3639 { 3640 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3641 struct rps_dev_flow_table *flow_table; 3642 struct rps_dev_flow *rflow; 3643 bool expire = true; 3644 unsigned int cpu; 3645 3646 rcu_read_lock(); 3647 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3648 if (flow_table && flow_id <= flow_table->mask) { 3649 rflow = &flow_table->flows[flow_id]; 3650 cpu = ACCESS_ONCE(rflow->cpu); 3651 if (rflow->filter == filter_id && cpu < nr_cpu_ids && 3652 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3653 rflow->last_qtail) < 3654 (int)(10 * flow_table->mask))) 3655 expire = false; 3656 } 3657 rcu_read_unlock(); 3658 return expire; 3659 } 3660 EXPORT_SYMBOL(rps_may_expire_flow); 3661 3662 #endif /* CONFIG_RFS_ACCEL */ 3663 3664 /* Called from hardirq (IPI) context */ 3665 static void rps_trigger_softirq(void *data) 3666 { 3667 struct softnet_data *sd = data; 3668 3669 ____napi_schedule(sd, &sd->backlog); 3670 sd->received_rps++; 3671 } 3672 3673 #endif /* CONFIG_RPS */ 3674 3675 /* 3676 * Check if this softnet_data structure is another cpu one 3677 * If yes, queue it to our IPI list and return 1 3678 * If no, return 0 3679 */ 3680 static int rps_ipi_queued(struct softnet_data *sd) 3681 { 3682 #ifdef CONFIG_RPS 3683 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3684 3685 if (sd != mysd) { 3686 sd->rps_ipi_next = mysd->rps_ipi_list; 3687 mysd->rps_ipi_list = sd; 3688 3689 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3690 return 1; 3691 } 3692 #endif /* CONFIG_RPS */ 3693 return 0; 3694 } 3695 3696 #ifdef CONFIG_NET_FLOW_LIMIT 3697 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3698 #endif 3699 3700 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3701 { 3702 #ifdef CONFIG_NET_FLOW_LIMIT 3703 struct sd_flow_limit *fl; 3704 struct softnet_data *sd; 3705 unsigned int old_flow, new_flow; 3706 3707 if (qlen < (netdev_max_backlog >> 1)) 3708 return false; 3709 3710 sd = this_cpu_ptr(&softnet_data); 3711 3712 rcu_read_lock(); 3713 fl = rcu_dereference(sd->flow_limit); 3714 if (fl) { 3715 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3716 old_flow = fl->history[fl->history_head]; 3717 fl->history[fl->history_head] = new_flow; 3718 3719 fl->history_head++; 3720 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3721 3722 if (likely(fl->buckets[old_flow])) 3723 fl->buckets[old_flow]--; 3724 3725 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3726 fl->count++; 3727 rcu_read_unlock(); 3728 return true; 3729 } 3730 } 3731 rcu_read_unlock(); 3732 #endif 3733 return false; 3734 } 3735 3736 /* 3737 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3738 * queue (may be a remote CPU queue). 3739 */ 3740 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3741 unsigned int *qtail) 3742 { 3743 struct softnet_data *sd; 3744 unsigned long flags; 3745 unsigned int qlen; 3746 3747 sd = &per_cpu(softnet_data, cpu); 3748 3749 local_irq_save(flags); 3750 3751 rps_lock(sd); 3752 if (!netif_running(skb->dev)) 3753 goto drop; 3754 qlen = skb_queue_len(&sd->input_pkt_queue); 3755 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3756 if (qlen) { 3757 enqueue: 3758 __skb_queue_tail(&sd->input_pkt_queue, skb); 3759 input_queue_tail_incr_save(sd, qtail); 3760 rps_unlock(sd); 3761 local_irq_restore(flags); 3762 return NET_RX_SUCCESS; 3763 } 3764 3765 /* Schedule NAPI for backlog device 3766 * We can use non atomic operation since we own the queue lock 3767 */ 3768 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3769 if (!rps_ipi_queued(sd)) 3770 ____napi_schedule(sd, &sd->backlog); 3771 } 3772 goto enqueue; 3773 } 3774 3775 drop: 3776 sd->dropped++; 3777 rps_unlock(sd); 3778 3779 local_irq_restore(flags); 3780 3781 atomic_long_inc(&skb->dev->rx_dropped); 3782 kfree_skb(skb); 3783 return NET_RX_DROP; 3784 } 3785 3786 static int netif_rx_internal(struct sk_buff *skb) 3787 { 3788 int ret; 3789 3790 net_timestamp_check(netdev_tstamp_prequeue, skb); 3791 3792 trace_netif_rx(skb); 3793 #ifdef CONFIG_RPS 3794 if (static_key_false(&rps_needed)) { 3795 struct rps_dev_flow voidflow, *rflow = &voidflow; 3796 int cpu; 3797 3798 preempt_disable(); 3799 rcu_read_lock(); 3800 3801 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3802 if (cpu < 0) 3803 cpu = smp_processor_id(); 3804 3805 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3806 3807 rcu_read_unlock(); 3808 preempt_enable(); 3809 } else 3810 #endif 3811 { 3812 unsigned int qtail; 3813 3814 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3815 put_cpu(); 3816 } 3817 return ret; 3818 } 3819 3820 /** 3821 * netif_rx - post buffer to the network code 3822 * @skb: buffer to post 3823 * 3824 * This function receives a packet from a device driver and queues it for 3825 * the upper (protocol) levels to process. It always succeeds. The buffer 3826 * may be dropped during processing for congestion control or by the 3827 * protocol layers. 3828 * 3829 * return values: 3830 * NET_RX_SUCCESS (no congestion) 3831 * NET_RX_DROP (packet was dropped) 3832 * 3833 */ 3834 3835 int netif_rx(struct sk_buff *skb) 3836 { 3837 trace_netif_rx_entry(skb); 3838 3839 return netif_rx_internal(skb); 3840 } 3841 EXPORT_SYMBOL(netif_rx); 3842 3843 int netif_rx_ni(struct sk_buff *skb) 3844 { 3845 int err; 3846 3847 trace_netif_rx_ni_entry(skb); 3848 3849 preempt_disable(); 3850 err = netif_rx_internal(skb); 3851 if (local_softirq_pending()) 3852 do_softirq(); 3853 preempt_enable(); 3854 3855 return err; 3856 } 3857 EXPORT_SYMBOL(netif_rx_ni); 3858 3859 static __latent_entropy void net_tx_action(struct softirq_action *h) 3860 { 3861 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3862 3863 if (sd->completion_queue) { 3864 struct sk_buff *clist; 3865 3866 local_irq_disable(); 3867 clist = sd->completion_queue; 3868 sd->completion_queue = NULL; 3869 local_irq_enable(); 3870 3871 while (clist) { 3872 struct sk_buff *skb = clist; 3873 3874 clist = clist->next; 3875 3876 WARN_ON(atomic_read(&skb->users)); 3877 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3878 trace_consume_skb(skb); 3879 else 3880 trace_kfree_skb(skb, net_tx_action); 3881 3882 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) 3883 __kfree_skb(skb); 3884 else 3885 __kfree_skb_defer(skb); 3886 } 3887 3888 __kfree_skb_flush(); 3889 } 3890 3891 if (sd->output_queue) { 3892 struct Qdisc *head; 3893 3894 local_irq_disable(); 3895 head = sd->output_queue; 3896 sd->output_queue = NULL; 3897 sd->output_queue_tailp = &sd->output_queue; 3898 local_irq_enable(); 3899 3900 while (head) { 3901 struct Qdisc *q = head; 3902 spinlock_t *root_lock; 3903 3904 head = head->next_sched; 3905 3906 root_lock = qdisc_lock(q); 3907 spin_lock(root_lock); 3908 /* We need to make sure head->next_sched is read 3909 * before clearing __QDISC_STATE_SCHED 3910 */ 3911 smp_mb__before_atomic(); 3912 clear_bit(__QDISC_STATE_SCHED, &q->state); 3913 qdisc_run(q); 3914 spin_unlock(root_lock); 3915 } 3916 } 3917 } 3918 3919 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) 3920 /* This hook is defined here for ATM LANE */ 3921 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3922 unsigned char *addr) __read_mostly; 3923 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3924 #endif 3925 3926 static inline struct sk_buff * 3927 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, 3928 struct net_device *orig_dev) 3929 { 3930 #ifdef CONFIG_NET_CLS_ACT 3931 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); 3932 struct tcf_result cl_res; 3933 3934 /* If there's at least one ingress present somewhere (so 3935 * we get here via enabled static key), remaining devices 3936 * that are not configured with an ingress qdisc will bail 3937 * out here. 3938 */ 3939 if (!cl) 3940 return skb; 3941 if (*pt_prev) { 3942 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3943 *pt_prev = NULL; 3944 } 3945 3946 qdisc_skb_cb(skb)->pkt_len = skb->len; 3947 skb->tc_at_ingress = 1; 3948 qdisc_bstats_cpu_update(cl->q, skb); 3949 3950 switch (tc_classify(skb, cl, &cl_res, false)) { 3951 case TC_ACT_OK: 3952 case TC_ACT_RECLASSIFY: 3953 skb->tc_index = TC_H_MIN(cl_res.classid); 3954 break; 3955 case TC_ACT_SHOT: 3956 qdisc_qstats_cpu_drop(cl->q); 3957 kfree_skb(skb); 3958 return NULL; 3959 case TC_ACT_STOLEN: 3960 case TC_ACT_QUEUED: 3961 consume_skb(skb); 3962 return NULL; 3963 case TC_ACT_REDIRECT: 3964 /* skb_mac_header check was done by cls/act_bpf, so 3965 * we can safely push the L2 header back before 3966 * redirecting to another netdev 3967 */ 3968 __skb_push(skb, skb->mac_len); 3969 skb_do_redirect(skb); 3970 return NULL; 3971 default: 3972 break; 3973 } 3974 #endif /* CONFIG_NET_CLS_ACT */ 3975 return skb; 3976 } 3977 3978 /** 3979 * netdev_is_rx_handler_busy - check if receive handler is registered 3980 * @dev: device to check 3981 * 3982 * Check if a receive handler is already registered for a given device. 3983 * Return true if there one. 3984 * 3985 * The caller must hold the rtnl_mutex. 3986 */ 3987 bool netdev_is_rx_handler_busy(struct net_device *dev) 3988 { 3989 ASSERT_RTNL(); 3990 return dev && rtnl_dereference(dev->rx_handler); 3991 } 3992 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); 3993 3994 /** 3995 * netdev_rx_handler_register - register receive handler 3996 * @dev: device to register a handler for 3997 * @rx_handler: receive handler to register 3998 * @rx_handler_data: data pointer that is used by rx handler 3999 * 4000 * Register a receive handler for a device. This handler will then be 4001 * called from __netif_receive_skb. A negative errno code is returned 4002 * on a failure. 4003 * 4004 * The caller must hold the rtnl_mutex. 4005 * 4006 * For a general description of rx_handler, see enum rx_handler_result. 4007 */ 4008 int netdev_rx_handler_register(struct net_device *dev, 4009 rx_handler_func_t *rx_handler, 4010 void *rx_handler_data) 4011 { 4012 if (netdev_is_rx_handler_busy(dev)) 4013 return -EBUSY; 4014 4015 /* Note: rx_handler_data must be set before rx_handler */ 4016 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 4017 rcu_assign_pointer(dev->rx_handler, rx_handler); 4018 4019 return 0; 4020 } 4021 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 4022 4023 /** 4024 * netdev_rx_handler_unregister - unregister receive handler 4025 * @dev: device to unregister a handler from 4026 * 4027 * Unregister a receive handler from a device. 4028 * 4029 * The caller must hold the rtnl_mutex. 4030 */ 4031 void netdev_rx_handler_unregister(struct net_device *dev) 4032 { 4033 4034 ASSERT_RTNL(); 4035 RCU_INIT_POINTER(dev->rx_handler, NULL); 4036 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 4037 * section has a guarantee to see a non NULL rx_handler_data 4038 * as well. 4039 */ 4040 synchronize_net(); 4041 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 4042 } 4043 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 4044 4045 /* 4046 * Limit the use of PFMEMALLOC reserves to those protocols that implement 4047 * the special handling of PFMEMALLOC skbs. 4048 */ 4049 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 4050 { 4051 switch (skb->protocol) { 4052 case htons(ETH_P_ARP): 4053 case htons(ETH_P_IP): 4054 case htons(ETH_P_IPV6): 4055 case htons(ETH_P_8021Q): 4056 case htons(ETH_P_8021AD): 4057 return true; 4058 default: 4059 return false; 4060 } 4061 } 4062 4063 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, 4064 int *ret, struct net_device *orig_dev) 4065 { 4066 #ifdef CONFIG_NETFILTER_INGRESS 4067 if (nf_hook_ingress_active(skb)) { 4068 int ingress_retval; 4069 4070 if (*pt_prev) { 4071 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4072 *pt_prev = NULL; 4073 } 4074 4075 rcu_read_lock(); 4076 ingress_retval = nf_hook_ingress(skb); 4077 rcu_read_unlock(); 4078 return ingress_retval; 4079 } 4080 #endif /* CONFIG_NETFILTER_INGRESS */ 4081 return 0; 4082 } 4083 4084 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 4085 { 4086 struct packet_type *ptype, *pt_prev; 4087 rx_handler_func_t *rx_handler; 4088 struct net_device *orig_dev; 4089 bool deliver_exact = false; 4090 int ret = NET_RX_DROP; 4091 __be16 type; 4092 4093 net_timestamp_check(!netdev_tstamp_prequeue, skb); 4094 4095 trace_netif_receive_skb(skb); 4096 4097 orig_dev = skb->dev; 4098 4099 skb_reset_network_header(skb); 4100 if (!skb_transport_header_was_set(skb)) 4101 skb_reset_transport_header(skb); 4102 skb_reset_mac_len(skb); 4103 4104 pt_prev = NULL; 4105 4106 another_round: 4107 skb->skb_iif = skb->dev->ifindex; 4108 4109 __this_cpu_inc(softnet_data.processed); 4110 4111 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 4112 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 4113 skb = skb_vlan_untag(skb); 4114 if (unlikely(!skb)) 4115 goto out; 4116 } 4117 4118 if (skb_skip_tc_classify(skb)) 4119 goto skip_classify; 4120 4121 if (pfmemalloc) 4122 goto skip_taps; 4123 4124 list_for_each_entry_rcu(ptype, &ptype_all, list) { 4125 if (pt_prev) 4126 ret = deliver_skb(skb, pt_prev, orig_dev); 4127 pt_prev = ptype; 4128 } 4129 4130 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 4131 if (pt_prev) 4132 ret = deliver_skb(skb, pt_prev, orig_dev); 4133 pt_prev = ptype; 4134 } 4135 4136 skip_taps: 4137 #ifdef CONFIG_NET_INGRESS 4138 if (static_key_false(&ingress_needed)) { 4139 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 4140 if (!skb) 4141 goto out; 4142 4143 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) 4144 goto out; 4145 } 4146 #endif 4147 skb_reset_tc(skb); 4148 skip_classify: 4149 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4150 goto drop; 4151 4152 if (skb_vlan_tag_present(skb)) { 4153 if (pt_prev) { 4154 ret = deliver_skb(skb, pt_prev, orig_dev); 4155 pt_prev = NULL; 4156 } 4157 if (vlan_do_receive(&skb)) 4158 goto another_round; 4159 else if (unlikely(!skb)) 4160 goto out; 4161 } 4162 4163 rx_handler = rcu_dereference(skb->dev->rx_handler); 4164 if (rx_handler) { 4165 if (pt_prev) { 4166 ret = deliver_skb(skb, pt_prev, orig_dev); 4167 pt_prev = NULL; 4168 } 4169 switch (rx_handler(&skb)) { 4170 case RX_HANDLER_CONSUMED: 4171 ret = NET_RX_SUCCESS; 4172 goto out; 4173 case RX_HANDLER_ANOTHER: 4174 goto another_round; 4175 case RX_HANDLER_EXACT: 4176 deliver_exact = true; 4177 case RX_HANDLER_PASS: 4178 break; 4179 default: 4180 BUG(); 4181 } 4182 } 4183 4184 if (unlikely(skb_vlan_tag_present(skb))) { 4185 if (skb_vlan_tag_get_id(skb)) 4186 skb->pkt_type = PACKET_OTHERHOST; 4187 /* Note: we might in the future use prio bits 4188 * and set skb->priority like in vlan_do_receive() 4189 * For the time being, just ignore Priority Code Point 4190 */ 4191 skb->vlan_tci = 0; 4192 } 4193 4194 type = skb->protocol; 4195 4196 /* deliver only exact match when indicated */ 4197 if (likely(!deliver_exact)) { 4198 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4199 &ptype_base[ntohs(type) & 4200 PTYPE_HASH_MASK]); 4201 } 4202 4203 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4204 &orig_dev->ptype_specific); 4205 4206 if (unlikely(skb->dev != orig_dev)) { 4207 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4208 &skb->dev->ptype_specific); 4209 } 4210 4211 if (pt_prev) { 4212 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 4213 goto drop; 4214 else 4215 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 4216 } else { 4217 drop: 4218 if (!deliver_exact) 4219 atomic_long_inc(&skb->dev->rx_dropped); 4220 else 4221 atomic_long_inc(&skb->dev->rx_nohandler); 4222 kfree_skb(skb); 4223 /* Jamal, now you will not able to escape explaining 4224 * me how you were going to use this. :-) 4225 */ 4226 ret = NET_RX_DROP; 4227 } 4228 4229 out: 4230 return ret; 4231 } 4232 4233 static int __netif_receive_skb(struct sk_buff *skb) 4234 { 4235 int ret; 4236 4237 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 4238 unsigned long pflags = current->flags; 4239 4240 /* 4241 * PFMEMALLOC skbs are special, they should 4242 * - be delivered to SOCK_MEMALLOC sockets only 4243 * - stay away from userspace 4244 * - have bounded memory usage 4245 * 4246 * Use PF_MEMALLOC as this saves us from propagating the allocation 4247 * context down to all allocation sites. 4248 */ 4249 current->flags |= PF_MEMALLOC; 4250 ret = __netif_receive_skb_core(skb, true); 4251 current_restore_flags(pflags, PF_MEMALLOC); 4252 } else 4253 ret = __netif_receive_skb_core(skb, false); 4254 4255 return ret; 4256 } 4257 4258 static struct static_key generic_xdp_needed __read_mostly; 4259 4260 static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) 4261 { 4262 struct bpf_prog *new = xdp->prog; 4263 int ret = 0; 4264 4265 switch (xdp->command) { 4266 case XDP_SETUP_PROG: { 4267 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); 4268 4269 rcu_assign_pointer(dev->xdp_prog, new); 4270 if (old) 4271 bpf_prog_put(old); 4272 4273 if (old && !new) { 4274 static_key_slow_dec(&generic_xdp_needed); 4275 } else if (new && !old) { 4276 static_key_slow_inc(&generic_xdp_needed); 4277 dev_disable_lro(dev); 4278 } 4279 break; 4280 } 4281 4282 case XDP_QUERY_PROG: 4283 xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); 4284 break; 4285 4286 default: 4287 ret = -EINVAL; 4288 break; 4289 } 4290 4291 return ret; 4292 } 4293 4294 static u32 netif_receive_generic_xdp(struct sk_buff *skb, 4295 struct bpf_prog *xdp_prog) 4296 { 4297 struct xdp_buff xdp; 4298 u32 act = XDP_DROP; 4299 void *orig_data; 4300 int hlen, off; 4301 u32 mac_len; 4302 4303 /* Reinjected packets coming from act_mirred or similar should 4304 * not get XDP generic processing. 4305 */ 4306 if (skb_cloned(skb)) 4307 return XDP_PASS; 4308 4309 if (skb_linearize(skb)) 4310 goto do_drop; 4311 4312 /* The XDP program wants to see the packet starting at the MAC 4313 * header. 4314 */ 4315 mac_len = skb->data - skb_mac_header(skb); 4316 hlen = skb_headlen(skb) + mac_len; 4317 xdp.data = skb->data - mac_len; 4318 xdp.data_end = xdp.data + hlen; 4319 xdp.data_hard_start = skb->data - skb_headroom(skb); 4320 orig_data = xdp.data; 4321 4322 act = bpf_prog_run_xdp(xdp_prog, &xdp); 4323 4324 off = xdp.data - orig_data; 4325 if (off > 0) 4326 __skb_pull(skb, off); 4327 else if (off < 0) 4328 __skb_push(skb, -off); 4329 4330 switch (act) { 4331 case XDP_TX: 4332 __skb_push(skb, mac_len); 4333 /* fall through */ 4334 case XDP_PASS: 4335 break; 4336 4337 default: 4338 bpf_warn_invalid_xdp_action(act); 4339 /* fall through */ 4340 case XDP_ABORTED: 4341 trace_xdp_exception(skb->dev, xdp_prog, act); 4342 /* fall through */ 4343 case XDP_DROP: 4344 do_drop: 4345 kfree_skb(skb); 4346 break; 4347 } 4348 4349 return act; 4350 } 4351 4352 /* When doing generic XDP we have to bypass the qdisc layer and the 4353 * network taps in order to match in-driver-XDP behavior. 4354 */ 4355 static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) 4356 { 4357 struct net_device *dev = skb->dev; 4358 struct netdev_queue *txq; 4359 bool free_skb = true; 4360 int cpu, rc; 4361 4362 txq = netdev_pick_tx(dev, skb, NULL); 4363 cpu = smp_processor_id(); 4364 HARD_TX_LOCK(dev, txq, cpu); 4365 if (!netif_xmit_stopped(txq)) { 4366 rc = netdev_start_xmit(skb, dev, txq, 0); 4367 if (dev_xmit_complete(rc)) 4368 free_skb = false; 4369 } 4370 HARD_TX_UNLOCK(dev, txq); 4371 if (free_skb) { 4372 trace_xdp_exception(dev, xdp_prog, XDP_TX); 4373 kfree_skb(skb); 4374 } 4375 } 4376 4377 static int netif_receive_skb_internal(struct sk_buff *skb) 4378 { 4379 int ret; 4380 4381 net_timestamp_check(netdev_tstamp_prequeue, skb); 4382 4383 if (skb_defer_rx_timestamp(skb)) 4384 return NET_RX_SUCCESS; 4385 4386 rcu_read_lock(); 4387 4388 if (static_key_false(&generic_xdp_needed)) { 4389 struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); 4390 4391 if (xdp_prog) { 4392 u32 act = netif_receive_generic_xdp(skb, xdp_prog); 4393 4394 if (act != XDP_PASS) { 4395 rcu_read_unlock(); 4396 if (act == XDP_TX) 4397 generic_xdp_tx(skb, xdp_prog); 4398 return NET_RX_DROP; 4399 } 4400 } 4401 } 4402 4403 #ifdef CONFIG_RPS 4404 if (static_key_false(&rps_needed)) { 4405 struct rps_dev_flow voidflow, *rflow = &voidflow; 4406 int cpu = get_rps_cpu(skb->dev, skb, &rflow); 4407 4408 if (cpu >= 0) { 4409 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 4410 rcu_read_unlock(); 4411 return ret; 4412 } 4413 } 4414 #endif 4415 ret = __netif_receive_skb(skb); 4416 rcu_read_unlock(); 4417 return ret; 4418 } 4419 4420 /** 4421 * netif_receive_skb - process receive buffer from network 4422 * @skb: buffer to process 4423 * 4424 * netif_receive_skb() is the main receive data processing function. 4425 * It always succeeds. The buffer may be dropped during processing 4426 * for congestion control or by the protocol layers. 4427 * 4428 * This function may only be called from softirq context and interrupts 4429 * should be enabled. 4430 * 4431 * Return values (usually ignored): 4432 * NET_RX_SUCCESS: no congestion 4433 * NET_RX_DROP: packet was dropped 4434 */ 4435 int netif_receive_skb(struct sk_buff *skb) 4436 { 4437 trace_netif_receive_skb_entry(skb); 4438 4439 return netif_receive_skb_internal(skb); 4440 } 4441 EXPORT_SYMBOL(netif_receive_skb); 4442 4443 DEFINE_PER_CPU(struct work_struct, flush_works); 4444 4445 /* Network device is going away, flush any packets still pending */ 4446 static void flush_backlog(struct work_struct *work) 4447 { 4448 struct sk_buff *skb, *tmp; 4449 struct softnet_data *sd; 4450 4451 local_bh_disable(); 4452 sd = this_cpu_ptr(&softnet_data); 4453 4454 local_irq_disable(); 4455 rps_lock(sd); 4456 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4457 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4458 __skb_unlink(skb, &sd->input_pkt_queue); 4459 kfree_skb(skb); 4460 input_queue_head_incr(sd); 4461 } 4462 } 4463 rps_unlock(sd); 4464 local_irq_enable(); 4465 4466 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4467 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4468 __skb_unlink(skb, &sd->process_queue); 4469 kfree_skb(skb); 4470 input_queue_head_incr(sd); 4471 } 4472 } 4473 local_bh_enable(); 4474 } 4475 4476 static void flush_all_backlogs(void) 4477 { 4478 unsigned int cpu; 4479 4480 get_online_cpus(); 4481 4482 for_each_online_cpu(cpu) 4483 queue_work_on(cpu, system_highpri_wq, 4484 per_cpu_ptr(&flush_works, cpu)); 4485 4486 for_each_online_cpu(cpu) 4487 flush_work(per_cpu_ptr(&flush_works, cpu)); 4488 4489 put_online_cpus(); 4490 } 4491 4492 static int napi_gro_complete(struct sk_buff *skb) 4493 { 4494 struct packet_offload *ptype; 4495 __be16 type = skb->protocol; 4496 struct list_head *head = &offload_base; 4497 int err = -ENOENT; 4498 4499 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 4500 4501 if (NAPI_GRO_CB(skb)->count == 1) { 4502 skb_shinfo(skb)->gso_size = 0; 4503 goto out; 4504 } 4505 4506 rcu_read_lock(); 4507 list_for_each_entry_rcu(ptype, head, list) { 4508 if (ptype->type != type || !ptype->callbacks.gro_complete) 4509 continue; 4510 4511 err = ptype->callbacks.gro_complete(skb, 0); 4512 break; 4513 } 4514 rcu_read_unlock(); 4515 4516 if (err) { 4517 WARN_ON(&ptype->list == head); 4518 kfree_skb(skb); 4519 return NET_RX_SUCCESS; 4520 } 4521 4522 out: 4523 return netif_receive_skb_internal(skb); 4524 } 4525 4526 /* napi->gro_list contains packets ordered by age. 4527 * youngest packets at the head of it. 4528 * Complete skbs in reverse order to reduce latencies. 4529 */ 4530 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 4531 { 4532 struct sk_buff *skb, *prev = NULL; 4533 4534 /* scan list and build reverse chain */ 4535 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 4536 skb->prev = prev; 4537 prev = skb; 4538 } 4539 4540 for (skb = prev; skb; skb = prev) { 4541 skb->next = NULL; 4542 4543 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 4544 return; 4545 4546 prev = skb->prev; 4547 napi_gro_complete(skb); 4548 napi->gro_count--; 4549 } 4550 4551 napi->gro_list = NULL; 4552 } 4553 EXPORT_SYMBOL(napi_gro_flush); 4554 4555 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 4556 { 4557 struct sk_buff *p; 4558 unsigned int maclen = skb->dev->hard_header_len; 4559 u32 hash = skb_get_hash_raw(skb); 4560 4561 for (p = napi->gro_list; p; p = p->next) { 4562 unsigned long diffs; 4563 4564 NAPI_GRO_CB(p)->flush = 0; 4565 4566 if (hash != skb_get_hash_raw(p)) { 4567 NAPI_GRO_CB(p)->same_flow = 0; 4568 continue; 4569 } 4570 4571 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 4572 diffs |= p->vlan_tci ^ skb->vlan_tci; 4573 diffs |= skb_metadata_dst_cmp(p, skb); 4574 if (maclen == ETH_HLEN) 4575 diffs |= compare_ether_header(skb_mac_header(p), 4576 skb_mac_header(skb)); 4577 else if (!diffs) 4578 diffs = memcmp(skb_mac_header(p), 4579 skb_mac_header(skb), 4580 maclen); 4581 NAPI_GRO_CB(p)->same_flow = !diffs; 4582 } 4583 } 4584 4585 static void skb_gro_reset_offset(struct sk_buff *skb) 4586 { 4587 const struct skb_shared_info *pinfo = skb_shinfo(skb); 4588 const skb_frag_t *frag0 = &pinfo->frags[0]; 4589 4590 NAPI_GRO_CB(skb)->data_offset = 0; 4591 NAPI_GRO_CB(skb)->frag0 = NULL; 4592 NAPI_GRO_CB(skb)->frag0_len = 0; 4593 4594 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 4595 pinfo->nr_frags && 4596 !PageHighMem(skb_frag_page(frag0))) { 4597 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4598 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, 4599 skb_frag_size(frag0), 4600 skb->end - skb->tail); 4601 } 4602 } 4603 4604 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 4605 { 4606 struct skb_shared_info *pinfo = skb_shinfo(skb); 4607 4608 BUG_ON(skb->end - skb->tail < grow); 4609 4610 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4611 4612 skb->data_len -= grow; 4613 skb->tail += grow; 4614 4615 pinfo->frags[0].page_offset += grow; 4616 skb_frag_size_sub(&pinfo->frags[0], grow); 4617 4618 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4619 skb_frag_unref(skb, 0); 4620 memmove(pinfo->frags, pinfo->frags + 1, 4621 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4622 } 4623 } 4624 4625 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4626 { 4627 struct sk_buff **pp = NULL; 4628 struct packet_offload *ptype; 4629 __be16 type = skb->protocol; 4630 struct list_head *head = &offload_base; 4631 int same_flow; 4632 enum gro_result ret; 4633 int grow; 4634 4635 if (netif_elide_gro(skb->dev)) 4636 goto normal; 4637 4638 if (skb->csum_bad) 4639 goto normal; 4640 4641 gro_list_prepare(napi, skb); 4642 4643 rcu_read_lock(); 4644 list_for_each_entry_rcu(ptype, head, list) { 4645 if (ptype->type != type || !ptype->callbacks.gro_receive) 4646 continue; 4647 4648 skb_set_network_header(skb, skb_gro_offset(skb)); 4649 skb_reset_mac_len(skb); 4650 NAPI_GRO_CB(skb)->same_flow = 0; 4651 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); 4652 NAPI_GRO_CB(skb)->free = 0; 4653 NAPI_GRO_CB(skb)->encap_mark = 0; 4654 NAPI_GRO_CB(skb)->recursion_counter = 0; 4655 NAPI_GRO_CB(skb)->is_fou = 0; 4656 NAPI_GRO_CB(skb)->is_atomic = 1; 4657 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4658 4659 /* Setup for GRO checksum validation */ 4660 switch (skb->ip_summed) { 4661 case CHECKSUM_COMPLETE: 4662 NAPI_GRO_CB(skb)->csum = skb->csum; 4663 NAPI_GRO_CB(skb)->csum_valid = 1; 4664 NAPI_GRO_CB(skb)->csum_cnt = 0; 4665 break; 4666 case CHECKSUM_UNNECESSARY: 4667 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4668 NAPI_GRO_CB(skb)->csum_valid = 0; 4669 break; 4670 default: 4671 NAPI_GRO_CB(skb)->csum_cnt = 0; 4672 NAPI_GRO_CB(skb)->csum_valid = 0; 4673 } 4674 4675 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4676 break; 4677 } 4678 rcu_read_unlock(); 4679 4680 if (&ptype->list == head) 4681 goto normal; 4682 4683 if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) { 4684 ret = GRO_CONSUMED; 4685 goto ok; 4686 } 4687 4688 same_flow = NAPI_GRO_CB(skb)->same_flow; 4689 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 4690 4691 if (pp) { 4692 struct sk_buff *nskb = *pp; 4693 4694 *pp = nskb->next; 4695 nskb->next = NULL; 4696 napi_gro_complete(nskb); 4697 napi->gro_count--; 4698 } 4699 4700 if (same_flow) 4701 goto ok; 4702 4703 if (NAPI_GRO_CB(skb)->flush) 4704 goto normal; 4705 4706 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4707 struct sk_buff *nskb = napi->gro_list; 4708 4709 /* locate the end of the list to select the 'oldest' flow */ 4710 while (nskb->next) { 4711 pp = &nskb->next; 4712 nskb = *pp; 4713 } 4714 *pp = NULL; 4715 nskb->next = NULL; 4716 napi_gro_complete(nskb); 4717 } else { 4718 napi->gro_count++; 4719 } 4720 NAPI_GRO_CB(skb)->count = 1; 4721 NAPI_GRO_CB(skb)->age = jiffies; 4722 NAPI_GRO_CB(skb)->last = skb; 4723 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4724 skb->next = napi->gro_list; 4725 napi->gro_list = skb; 4726 ret = GRO_HELD; 4727 4728 pull: 4729 grow = skb_gro_offset(skb) - skb_headlen(skb); 4730 if (grow > 0) 4731 gro_pull_from_frag0(skb, grow); 4732 ok: 4733 return ret; 4734 4735 normal: 4736 ret = GRO_NORMAL; 4737 goto pull; 4738 } 4739 4740 struct packet_offload *gro_find_receive_by_type(__be16 type) 4741 { 4742 struct list_head *offload_head = &offload_base; 4743 struct packet_offload *ptype; 4744 4745 list_for_each_entry_rcu(ptype, offload_head, list) { 4746 if (ptype->type != type || !ptype->callbacks.gro_receive) 4747 continue; 4748 return ptype; 4749 } 4750 return NULL; 4751 } 4752 EXPORT_SYMBOL(gro_find_receive_by_type); 4753 4754 struct packet_offload *gro_find_complete_by_type(__be16 type) 4755 { 4756 struct list_head *offload_head = &offload_base; 4757 struct packet_offload *ptype; 4758 4759 list_for_each_entry_rcu(ptype, offload_head, list) { 4760 if (ptype->type != type || !ptype->callbacks.gro_complete) 4761 continue; 4762 return ptype; 4763 } 4764 return NULL; 4765 } 4766 EXPORT_SYMBOL(gro_find_complete_by_type); 4767 4768 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4769 { 4770 switch (ret) { 4771 case GRO_NORMAL: 4772 if (netif_receive_skb_internal(skb)) 4773 ret = GRO_DROP; 4774 break; 4775 4776 case GRO_DROP: 4777 kfree_skb(skb); 4778 break; 4779 4780 case GRO_MERGED_FREE: 4781 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { 4782 skb_dst_drop(skb); 4783 secpath_reset(skb); 4784 kmem_cache_free(skbuff_head_cache, skb); 4785 } else { 4786 __kfree_skb(skb); 4787 } 4788 break; 4789 4790 case GRO_HELD: 4791 case GRO_MERGED: 4792 case GRO_CONSUMED: 4793 break; 4794 } 4795 4796 return ret; 4797 } 4798 4799 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4800 { 4801 skb_mark_napi_id(skb, napi); 4802 trace_napi_gro_receive_entry(skb); 4803 4804 skb_gro_reset_offset(skb); 4805 4806 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4807 } 4808 EXPORT_SYMBOL(napi_gro_receive); 4809 4810 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4811 { 4812 if (unlikely(skb->pfmemalloc)) { 4813 consume_skb(skb); 4814 return; 4815 } 4816 __skb_pull(skb, skb_headlen(skb)); 4817 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4818 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4819 skb->vlan_tci = 0; 4820 skb->dev = napi->dev; 4821 skb->skb_iif = 0; 4822 skb->encapsulation = 0; 4823 skb_shinfo(skb)->gso_type = 0; 4824 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4825 secpath_reset(skb); 4826 4827 napi->skb = skb; 4828 } 4829 4830 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4831 { 4832 struct sk_buff *skb = napi->skb; 4833 4834 if (!skb) { 4835 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4836 if (skb) { 4837 napi->skb = skb; 4838 skb_mark_napi_id(skb, napi); 4839 } 4840 } 4841 return skb; 4842 } 4843 EXPORT_SYMBOL(napi_get_frags); 4844 4845 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4846 struct sk_buff *skb, 4847 gro_result_t ret) 4848 { 4849 switch (ret) { 4850 case GRO_NORMAL: 4851 case GRO_HELD: 4852 __skb_push(skb, ETH_HLEN); 4853 skb->protocol = eth_type_trans(skb, skb->dev); 4854 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4855 ret = GRO_DROP; 4856 break; 4857 4858 case GRO_DROP: 4859 case GRO_MERGED_FREE: 4860 napi_reuse_skb(napi, skb); 4861 break; 4862 4863 case GRO_MERGED: 4864 case GRO_CONSUMED: 4865 break; 4866 } 4867 4868 return ret; 4869 } 4870 4871 /* Upper GRO stack assumes network header starts at gro_offset=0 4872 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4873 * We copy ethernet header into skb->data to have a common layout. 4874 */ 4875 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4876 { 4877 struct sk_buff *skb = napi->skb; 4878 const struct ethhdr *eth; 4879 unsigned int hlen = sizeof(*eth); 4880 4881 napi->skb = NULL; 4882 4883 skb_reset_mac_header(skb); 4884 skb_gro_reset_offset(skb); 4885 4886 eth = skb_gro_header_fast(skb, 0); 4887 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4888 eth = skb_gro_header_slow(skb, hlen, 0); 4889 if (unlikely(!eth)) { 4890 net_warn_ratelimited("%s: dropping impossible skb from %s\n", 4891 __func__, napi->dev->name); 4892 napi_reuse_skb(napi, skb); 4893 return NULL; 4894 } 4895 } else { 4896 gro_pull_from_frag0(skb, hlen); 4897 NAPI_GRO_CB(skb)->frag0 += hlen; 4898 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4899 } 4900 __skb_pull(skb, hlen); 4901 4902 /* 4903 * This works because the only protocols we care about don't require 4904 * special handling. 4905 * We'll fix it up properly in napi_frags_finish() 4906 */ 4907 skb->protocol = eth->h_proto; 4908 4909 return skb; 4910 } 4911 4912 gro_result_t napi_gro_frags(struct napi_struct *napi) 4913 { 4914 struct sk_buff *skb = napi_frags_skb(napi); 4915 4916 if (!skb) 4917 return GRO_DROP; 4918 4919 trace_napi_gro_frags_entry(skb); 4920 4921 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4922 } 4923 EXPORT_SYMBOL(napi_gro_frags); 4924 4925 /* Compute the checksum from gro_offset and return the folded value 4926 * after adding in any pseudo checksum. 4927 */ 4928 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4929 { 4930 __wsum wsum; 4931 __sum16 sum; 4932 4933 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4934 4935 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4936 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4937 if (likely(!sum)) { 4938 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4939 !skb->csum_complete_sw) 4940 netdev_rx_csum_fault(skb->dev); 4941 } 4942 4943 NAPI_GRO_CB(skb)->csum = wsum; 4944 NAPI_GRO_CB(skb)->csum_valid = 1; 4945 4946 return sum; 4947 } 4948 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4949 4950 /* 4951 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4952 * Note: called with local irq disabled, but exits with local irq enabled. 4953 */ 4954 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4955 { 4956 #ifdef CONFIG_RPS 4957 struct softnet_data *remsd = sd->rps_ipi_list; 4958 4959 if (remsd) { 4960 sd->rps_ipi_list = NULL; 4961 4962 local_irq_enable(); 4963 4964 /* Send pending IPI's to kick RPS processing on remote cpus. */ 4965 while (remsd) { 4966 struct softnet_data *next = remsd->rps_ipi_next; 4967 4968 if (cpu_online(remsd->cpu)) 4969 smp_call_function_single_async(remsd->cpu, 4970 &remsd->csd); 4971 remsd = next; 4972 } 4973 } else 4974 #endif 4975 local_irq_enable(); 4976 } 4977 4978 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4979 { 4980 #ifdef CONFIG_RPS 4981 return sd->rps_ipi_list != NULL; 4982 #else 4983 return false; 4984 #endif 4985 } 4986 4987 static int process_backlog(struct napi_struct *napi, int quota) 4988 { 4989 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4990 bool again = true; 4991 int work = 0; 4992 4993 /* Check if we have pending ipi, its better to send them now, 4994 * not waiting net_rx_action() end. 4995 */ 4996 if (sd_has_rps_ipi_waiting(sd)) { 4997 local_irq_disable(); 4998 net_rps_action_and_irq_enable(sd); 4999 } 5000 5001 napi->weight = dev_rx_weight; 5002 while (again) { 5003 struct sk_buff *skb; 5004 5005 while ((skb = __skb_dequeue(&sd->process_queue))) { 5006 rcu_read_lock(); 5007 __netif_receive_skb(skb); 5008 rcu_read_unlock(); 5009 input_queue_head_incr(sd); 5010 if (++work >= quota) 5011 return work; 5012 5013 } 5014 5015 local_irq_disable(); 5016 rps_lock(sd); 5017 if (skb_queue_empty(&sd->input_pkt_queue)) { 5018 /* 5019 * Inline a custom version of __napi_complete(). 5020 * only current cpu owns and manipulates this napi, 5021 * and NAPI_STATE_SCHED is the only possible flag set 5022 * on backlog. 5023 * We can use a plain write instead of clear_bit(), 5024 * and we dont need an smp_mb() memory barrier. 5025 */ 5026 napi->state = 0; 5027 again = false; 5028 } else { 5029 skb_queue_splice_tail_init(&sd->input_pkt_queue, 5030 &sd->process_queue); 5031 } 5032 rps_unlock(sd); 5033 local_irq_enable(); 5034 } 5035 5036 return work; 5037 } 5038 5039 /** 5040 * __napi_schedule - schedule for receive 5041 * @n: entry to schedule 5042 * 5043 * The entry's receive function will be scheduled to run. 5044 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 5045 */ 5046 void __napi_schedule(struct napi_struct *n) 5047 { 5048 unsigned long flags; 5049 5050 local_irq_save(flags); 5051 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 5052 local_irq_restore(flags); 5053 } 5054 EXPORT_SYMBOL(__napi_schedule); 5055 5056 /** 5057 * napi_schedule_prep - check if napi can be scheduled 5058 * @n: napi context 5059 * 5060 * Test if NAPI routine is already running, and if not mark 5061 * it as running. This is used as a condition variable 5062 * insure only one NAPI poll instance runs. We also make 5063 * sure there is no pending NAPI disable. 5064 */ 5065 bool napi_schedule_prep(struct napi_struct *n) 5066 { 5067 unsigned long val, new; 5068 5069 do { 5070 val = READ_ONCE(n->state); 5071 if (unlikely(val & NAPIF_STATE_DISABLE)) 5072 return false; 5073 new = val | NAPIF_STATE_SCHED; 5074 5075 /* Sets STATE_MISSED bit if STATE_SCHED was already set 5076 * This was suggested by Alexander Duyck, as compiler 5077 * emits better code than : 5078 * if (val & NAPIF_STATE_SCHED) 5079 * new |= NAPIF_STATE_MISSED; 5080 */ 5081 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * 5082 NAPIF_STATE_MISSED; 5083 } while (cmpxchg(&n->state, val, new) != val); 5084 5085 return !(val & NAPIF_STATE_SCHED); 5086 } 5087 EXPORT_SYMBOL(napi_schedule_prep); 5088 5089 /** 5090 * __napi_schedule_irqoff - schedule for receive 5091 * @n: entry to schedule 5092 * 5093 * Variant of __napi_schedule() assuming hard irqs are masked 5094 */ 5095 void __napi_schedule_irqoff(struct napi_struct *n) 5096 { 5097 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 5098 } 5099 EXPORT_SYMBOL(__napi_schedule_irqoff); 5100 5101 bool napi_complete_done(struct napi_struct *n, int work_done) 5102 { 5103 unsigned long flags, val, new; 5104 5105 /* 5106 * 1) Don't let napi dequeue from the cpu poll list 5107 * just in case its running on a different cpu. 5108 * 2) If we are busy polling, do nothing here, we have 5109 * the guarantee we will be called later. 5110 */ 5111 if (unlikely(n->state & (NAPIF_STATE_NPSVC | 5112 NAPIF_STATE_IN_BUSY_POLL))) 5113 return false; 5114 5115 if (n->gro_list) { 5116 unsigned long timeout = 0; 5117 5118 if (work_done) 5119 timeout = n->dev->gro_flush_timeout; 5120 5121 if (timeout) 5122 hrtimer_start(&n->timer, ns_to_ktime(timeout), 5123 HRTIMER_MODE_REL_PINNED); 5124 else 5125 napi_gro_flush(n, false); 5126 } 5127 if (unlikely(!list_empty(&n->poll_list))) { 5128 /* If n->poll_list is not empty, we need to mask irqs */ 5129 local_irq_save(flags); 5130 list_del_init(&n->poll_list); 5131 local_irq_restore(flags); 5132 } 5133 5134 do { 5135 val = READ_ONCE(n->state); 5136 5137 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); 5138 5139 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); 5140 5141 /* If STATE_MISSED was set, leave STATE_SCHED set, 5142 * because we will call napi->poll() one more time. 5143 * This C code was suggested by Alexander Duyck to help gcc. 5144 */ 5145 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * 5146 NAPIF_STATE_SCHED; 5147 } while (cmpxchg(&n->state, val, new) != val); 5148 5149 if (unlikely(val & NAPIF_STATE_MISSED)) { 5150 __napi_schedule(n); 5151 return false; 5152 } 5153 5154 return true; 5155 } 5156 EXPORT_SYMBOL(napi_complete_done); 5157 5158 /* must be called under rcu_read_lock(), as we dont take a reference */ 5159 static struct napi_struct *napi_by_id(unsigned int napi_id) 5160 { 5161 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 5162 struct napi_struct *napi; 5163 5164 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 5165 if (napi->napi_id == napi_id) 5166 return napi; 5167 5168 return NULL; 5169 } 5170 5171 #if defined(CONFIG_NET_RX_BUSY_POLL) 5172 5173 #define BUSY_POLL_BUDGET 8 5174 5175 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) 5176 { 5177 int rc; 5178 5179 /* Busy polling means there is a high chance device driver hard irq 5180 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was 5181 * set in napi_schedule_prep(). 5182 * Since we are about to call napi->poll() once more, we can safely 5183 * clear NAPI_STATE_MISSED. 5184 * 5185 * Note: x86 could use a single "lock and ..." instruction 5186 * to perform these two clear_bit() 5187 */ 5188 clear_bit(NAPI_STATE_MISSED, &napi->state); 5189 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); 5190 5191 local_bh_disable(); 5192 5193 /* All we really want here is to re-enable device interrupts. 5194 * Ideally, a new ndo_busy_poll_stop() could avoid another round. 5195 */ 5196 rc = napi->poll(napi, BUSY_POLL_BUDGET); 5197 netpoll_poll_unlock(have_poll_lock); 5198 if (rc == BUSY_POLL_BUDGET) 5199 __napi_schedule(napi); 5200 local_bh_enable(); 5201 if (local_softirq_pending()) 5202 do_softirq(); 5203 } 5204 5205 void napi_busy_loop(unsigned int napi_id, 5206 bool (*loop_end)(void *, unsigned long), 5207 void *loop_end_arg) 5208 { 5209 unsigned long start_time = loop_end ? busy_loop_current_time() : 0; 5210 int (*napi_poll)(struct napi_struct *napi, int budget); 5211 void *have_poll_lock = NULL; 5212 struct napi_struct *napi; 5213 5214 restart: 5215 napi_poll = NULL; 5216 5217 rcu_read_lock(); 5218 5219 napi = napi_by_id(napi_id); 5220 if (!napi) 5221 goto out; 5222 5223 preempt_disable(); 5224 for (;;) { 5225 int work = 0; 5226 5227 local_bh_disable(); 5228 if (!napi_poll) { 5229 unsigned long val = READ_ONCE(napi->state); 5230 5231 /* If multiple threads are competing for this napi, 5232 * we avoid dirtying napi->state as much as we can. 5233 */ 5234 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | 5235 NAPIF_STATE_IN_BUSY_POLL)) 5236 goto count; 5237 if (cmpxchg(&napi->state, val, 5238 val | NAPIF_STATE_IN_BUSY_POLL | 5239 NAPIF_STATE_SCHED) != val) 5240 goto count; 5241 have_poll_lock = netpoll_poll_lock(napi); 5242 napi_poll = napi->poll; 5243 } 5244 work = napi_poll(napi, BUSY_POLL_BUDGET); 5245 trace_napi_poll(napi, work, BUSY_POLL_BUDGET); 5246 count: 5247 if (work > 0) 5248 __NET_ADD_STATS(dev_net(napi->dev), 5249 LINUX_MIB_BUSYPOLLRXPACKETS, work); 5250 local_bh_enable(); 5251 5252 if (!loop_end || loop_end(loop_end_arg, start_time)) 5253 break; 5254 5255 if (unlikely(need_resched())) { 5256 if (napi_poll) 5257 busy_poll_stop(napi, have_poll_lock); 5258 preempt_enable(); 5259 rcu_read_unlock(); 5260 cond_resched(); 5261 if (loop_end(loop_end_arg, start_time)) 5262 return; 5263 goto restart; 5264 } 5265 cpu_relax(); 5266 } 5267 if (napi_poll) 5268 busy_poll_stop(napi, have_poll_lock); 5269 preempt_enable(); 5270 out: 5271 rcu_read_unlock(); 5272 } 5273 EXPORT_SYMBOL(napi_busy_loop); 5274 5275 #endif /* CONFIG_NET_RX_BUSY_POLL */ 5276 5277 static void napi_hash_add(struct napi_struct *napi) 5278 { 5279 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 5280 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 5281 return; 5282 5283 spin_lock(&napi_hash_lock); 5284 5285 /* 0..NR_CPUS range is reserved for sender_cpu use */ 5286 do { 5287 if (unlikely(++napi_gen_id < MIN_NAPI_ID)) 5288 napi_gen_id = MIN_NAPI_ID; 5289 } while (napi_by_id(napi_gen_id)); 5290 napi->napi_id = napi_gen_id; 5291 5292 hlist_add_head_rcu(&napi->napi_hash_node, 5293 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 5294 5295 spin_unlock(&napi_hash_lock); 5296 } 5297 5298 /* Warning : caller is responsible to make sure rcu grace period 5299 * is respected before freeing memory containing @napi 5300 */ 5301 bool napi_hash_del(struct napi_struct *napi) 5302 { 5303 bool rcu_sync_needed = false; 5304 5305 spin_lock(&napi_hash_lock); 5306 5307 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { 5308 rcu_sync_needed = true; 5309 hlist_del_rcu(&napi->napi_hash_node); 5310 } 5311 spin_unlock(&napi_hash_lock); 5312 return rcu_sync_needed; 5313 } 5314 EXPORT_SYMBOL_GPL(napi_hash_del); 5315 5316 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 5317 { 5318 struct napi_struct *napi; 5319 5320 napi = container_of(timer, struct napi_struct, timer); 5321 5322 /* Note : we use a relaxed variant of napi_schedule_prep() not setting 5323 * NAPI_STATE_MISSED, since we do not react to a device IRQ. 5324 */ 5325 if (napi->gro_list && !napi_disable_pending(napi) && 5326 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) 5327 __napi_schedule_irqoff(napi); 5328 5329 return HRTIMER_NORESTART; 5330 } 5331 5332 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 5333 int (*poll)(struct napi_struct *, int), int weight) 5334 { 5335 INIT_LIST_HEAD(&napi->poll_list); 5336 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 5337 napi->timer.function = napi_watchdog; 5338 napi->gro_count = 0; 5339 napi->gro_list = NULL; 5340 napi->skb = NULL; 5341 napi->poll = poll; 5342 if (weight > NAPI_POLL_WEIGHT) 5343 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 5344 weight, dev->name); 5345 napi->weight = weight; 5346 list_add(&napi->dev_list, &dev->napi_list); 5347 napi->dev = dev; 5348 #ifdef CONFIG_NETPOLL 5349 napi->poll_owner = -1; 5350 #endif 5351 set_bit(NAPI_STATE_SCHED, &napi->state); 5352 napi_hash_add(napi); 5353 } 5354 EXPORT_SYMBOL(netif_napi_add); 5355 5356 void napi_disable(struct napi_struct *n) 5357 { 5358 might_sleep(); 5359 set_bit(NAPI_STATE_DISABLE, &n->state); 5360 5361 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 5362 msleep(1); 5363 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) 5364 msleep(1); 5365 5366 hrtimer_cancel(&n->timer); 5367 5368 clear_bit(NAPI_STATE_DISABLE, &n->state); 5369 } 5370 EXPORT_SYMBOL(napi_disable); 5371 5372 /* Must be called in process context */ 5373 void netif_napi_del(struct napi_struct *napi) 5374 { 5375 might_sleep(); 5376 if (napi_hash_del(napi)) 5377 synchronize_net(); 5378 list_del_init(&napi->dev_list); 5379 napi_free_frags(napi); 5380 5381 kfree_skb_list(napi->gro_list); 5382 napi->gro_list = NULL; 5383 napi->gro_count = 0; 5384 } 5385 EXPORT_SYMBOL(netif_napi_del); 5386 5387 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 5388 { 5389 void *have; 5390 int work, weight; 5391 5392 list_del_init(&n->poll_list); 5393 5394 have = netpoll_poll_lock(n); 5395 5396 weight = n->weight; 5397 5398 /* This NAPI_STATE_SCHED test is for avoiding a race 5399 * with netpoll's poll_napi(). Only the entity which 5400 * obtains the lock and sees NAPI_STATE_SCHED set will 5401 * actually make the ->poll() call. Therefore we avoid 5402 * accidentally calling ->poll() when NAPI is not scheduled. 5403 */ 5404 work = 0; 5405 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 5406 work = n->poll(n, weight); 5407 trace_napi_poll(n, work, weight); 5408 } 5409 5410 WARN_ON_ONCE(work > weight); 5411 5412 if (likely(work < weight)) 5413 goto out_unlock; 5414 5415 /* Drivers must not modify the NAPI state if they 5416 * consume the entire weight. In such cases this code 5417 * still "owns" the NAPI instance and therefore can 5418 * move the instance around on the list at-will. 5419 */ 5420 if (unlikely(napi_disable_pending(n))) { 5421 napi_complete(n); 5422 goto out_unlock; 5423 } 5424 5425 if (n->gro_list) { 5426 /* flush too old packets 5427 * If HZ < 1000, flush all packets. 5428 */ 5429 napi_gro_flush(n, HZ >= 1000); 5430 } 5431 5432 /* Some drivers may have called napi_schedule 5433 * prior to exhausting their budget. 5434 */ 5435 if (unlikely(!list_empty(&n->poll_list))) { 5436 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 5437 n->dev ? n->dev->name : "backlog"); 5438 goto out_unlock; 5439 } 5440 5441 list_add_tail(&n->poll_list, repoll); 5442 5443 out_unlock: 5444 netpoll_poll_unlock(have); 5445 5446 return work; 5447 } 5448 5449 static __latent_entropy void net_rx_action(struct softirq_action *h) 5450 { 5451 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5452 unsigned long time_limit = jiffies + 5453 usecs_to_jiffies(netdev_budget_usecs); 5454 int budget = netdev_budget; 5455 LIST_HEAD(list); 5456 LIST_HEAD(repoll); 5457 5458 local_irq_disable(); 5459 list_splice_init(&sd->poll_list, &list); 5460 local_irq_enable(); 5461 5462 for (;;) { 5463 struct napi_struct *n; 5464 5465 if (list_empty(&list)) { 5466 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5467 goto out; 5468 break; 5469 } 5470 5471 n = list_first_entry(&list, struct napi_struct, poll_list); 5472 budget -= napi_poll(n, &repoll); 5473 5474 /* If softirq window is exhausted then punt. 5475 * Allow this to run for 2 jiffies since which will allow 5476 * an average latency of 1.5/HZ. 5477 */ 5478 if (unlikely(budget <= 0 || 5479 time_after_eq(jiffies, time_limit))) { 5480 sd->time_squeeze++; 5481 break; 5482 } 5483 } 5484 5485 local_irq_disable(); 5486 5487 list_splice_tail_init(&sd->poll_list, &list); 5488 list_splice_tail(&repoll, &list); 5489 list_splice(&list, &sd->poll_list); 5490 if (!list_empty(&sd->poll_list)) 5491 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5492 5493 net_rps_action_and_irq_enable(sd); 5494 out: 5495 __kfree_skb_flush(); 5496 } 5497 5498 struct netdev_adjacent { 5499 struct net_device *dev; 5500 5501 /* upper master flag, there can only be one master device per list */ 5502 bool master; 5503 5504 /* counter for the number of times this device was added to us */ 5505 u16 ref_nr; 5506 5507 /* private field for the users */ 5508 void *private; 5509 5510 struct list_head list; 5511 struct rcu_head rcu; 5512 }; 5513 5514 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, 5515 struct list_head *adj_list) 5516 { 5517 struct netdev_adjacent *adj; 5518 5519 list_for_each_entry(adj, adj_list, list) { 5520 if (adj->dev == adj_dev) 5521 return adj; 5522 } 5523 return NULL; 5524 } 5525 5526 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) 5527 { 5528 struct net_device *dev = data; 5529 5530 return upper_dev == dev; 5531 } 5532 5533 /** 5534 * netdev_has_upper_dev - Check if device is linked to an upper device 5535 * @dev: device 5536 * @upper_dev: upper device to check 5537 * 5538 * Find out if a device is linked to specified upper device and return true 5539 * in case it is. Note that this checks only immediate upper device, 5540 * not through a complete stack of devices. The caller must hold the RTNL lock. 5541 */ 5542 bool netdev_has_upper_dev(struct net_device *dev, 5543 struct net_device *upper_dev) 5544 { 5545 ASSERT_RTNL(); 5546 5547 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, 5548 upper_dev); 5549 } 5550 EXPORT_SYMBOL(netdev_has_upper_dev); 5551 5552 /** 5553 * netdev_has_upper_dev_all - Check if device is linked to an upper device 5554 * @dev: device 5555 * @upper_dev: upper device to check 5556 * 5557 * Find out if a device is linked to specified upper device and return true 5558 * in case it is. Note that this checks the entire upper device chain. 5559 * The caller must hold rcu lock. 5560 */ 5561 5562 bool netdev_has_upper_dev_all_rcu(struct net_device *dev, 5563 struct net_device *upper_dev) 5564 { 5565 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, 5566 upper_dev); 5567 } 5568 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); 5569 5570 /** 5571 * netdev_has_any_upper_dev - Check if device is linked to some device 5572 * @dev: device 5573 * 5574 * Find out if a device is linked to an upper device and return true in case 5575 * it is. The caller must hold the RTNL lock. 5576 */ 5577 static bool netdev_has_any_upper_dev(struct net_device *dev) 5578 { 5579 ASSERT_RTNL(); 5580 5581 return !list_empty(&dev->adj_list.upper); 5582 } 5583 5584 /** 5585 * netdev_master_upper_dev_get - Get master upper device 5586 * @dev: device 5587 * 5588 * Find a master upper device and return pointer to it or NULL in case 5589 * it's not there. The caller must hold the RTNL lock. 5590 */ 5591 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 5592 { 5593 struct netdev_adjacent *upper; 5594 5595 ASSERT_RTNL(); 5596 5597 if (list_empty(&dev->adj_list.upper)) 5598 return NULL; 5599 5600 upper = list_first_entry(&dev->adj_list.upper, 5601 struct netdev_adjacent, list); 5602 if (likely(upper->master)) 5603 return upper->dev; 5604 return NULL; 5605 } 5606 EXPORT_SYMBOL(netdev_master_upper_dev_get); 5607 5608 /** 5609 * netdev_has_any_lower_dev - Check if device is linked to some device 5610 * @dev: device 5611 * 5612 * Find out if a device is linked to a lower device and return true in case 5613 * it is. The caller must hold the RTNL lock. 5614 */ 5615 static bool netdev_has_any_lower_dev(struct net_device *dev) 5616 { 5617 ASSERT_RTNL(); 5618 5619 return !list_empty(&dev->adj_list.lower); 5620 } 5621 5622 void *netdev_adjacent_get_private(struct list_head *adj_list) 5623 { 5624 struct netdev_adjacent *adj; 5625 5626 adj = list_entry(adj_list, struct netdev_adjacent, list); 5627 5628 return adj->private; 5629 } 5630 EXPORT_SYMBOL(netdev_adjacent_get_private); 5631 5632 /** 5633 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 5634 * @dev: device 5635 * @iter: list_head ** of the current position 5636 * 5637 * Gets the next device from the dev's upper list, starting from iter 5638 * position. The caller must hold RCU read lock. 5639 */ 5640 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 5641 struct list_head **iter) 5642 { 5643 struct netdev_adjacent *upper; 5644 5645 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5646 5647 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5648 5649 if (&upper->list == &dev->adj_list.upper) 5650 return NULL; 5651 5652 *iter = &upper->list; 5653 5654 return upper->dev; 5655 } 5656 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5657 5658 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, 5659 struct list_head **iter) 5660 { 5661 struct netdev_adjacent *upper; 5662 5663 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5664 5665 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5666 5667 if (&upper->list == &dev->adj_list.upper) 5668 return NULL; 5669 5670 *iter = &upper->list; 5671 5672 return upper->dev; 5673 } 5674 5675 int netdev_walk_all_upper_dev_rcu(struct net_device *dev, 5676 int (*fn)(struct net_device *dev, 5677 void *data), 5678 void *data) 5679 { 5680 struct net_device *udev; 5681 struct list_head *iter; 5682 int ret; 5683 5684 for (iter = &dev->adj_list.upper, 5685 udev = netdev_next_upper_dev_rcu(dev, &iter); 5686 udev; 5687 udev = netdev_next_upper_dev_rcu(dev, &iter)) { 5688 /* first is the upper device itself */ 5689 ret = fn(udev, data); 5690 if (ret) 5691 return ret; 5692 5693 /* then look at all of its upper devices */ 5694 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); 5695 if (ret) 5696 return ret; 5697 } 5698 5699 return 0; 5700 } 5701 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); 5702 5703 /** 5704 * netdev_lower_get_next_private - Get the next ->private from the 5705 * lower neighbour list 5706 * @dev: device 5707 * @iter: list_head ** of the current position 5708 * 5709 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5710 * list, starting from iter position. The caller must hold either hold the 5711 * RTNL lock or its own locking that guarantees that the neighbour lower 5712 * list will remain unchanged. 5713 */ 5714 void *netdev_lower_get_next_private(struct net_device *dev, 5715 struct list_head **iter) 5716 { 5717 struct netdev_adjacent *lower; 5718 5719 lower = list_entry(*iter, struct netdev_adjacent, list); 5720 5721 if (&lower->list == &dev->adj_list.lower) 5722 return NULL; 5723 5724 *iter = lower->list.next; 5725 5726 return lower->private; 5727 } 5728 EXPORT_SYMBOL(netdev_lower_get_next_private); 5729 5730 /** 5731 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5732 * lower neighbour list, RCU 5733 * variant 5734 * @dev: device 5735 * @iter: list_head ** of the current position 5736 * 5737 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5738 * list, starting from iter position. The caller must hold RCU read lock. 5739 */ 5740 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5741 struct list_head **iter) 5742 { 5743 struct netdev_adjacent *lower; 5744 5745 WARN_ON_ONCE(!rcu_read_lock_held()); 5746 5747 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5748 5749 if (&lower->list == &dev->adj_list.lower) 5750 return NULL; 5751 5752 *iter = &lower->list; 5753 5754 return lower->private; 5755 } 5756 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5757 5758 /** 5759 * netdev_lower_get_next - Get the next device from the lower neighbour 5760 * list 5761 * @dev: device 5762 * @iter: list_head ** of the current position 5763 * 5764 * Gets the next netdev_adjacent from the dev's lower neighbour 5765 * list, starting from iter position. The caller must hold RTNL lock or 5766 * its own locking that guarantees that the neighbour lower 5767 * list will remain unchanged. 5768 */ 5769 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5770 { 5771 struct netdev_adjacent *lower; 5772 5773 lower = list_entry(*iter, struct netdev_adjacent, list); 5774 5775 if (&lower->list == &dev->adj_list.lower) 5776 return NULL; 5777 5778 *iter = lower->list.next; 5779 5780 return lower->dev; 5781 } 5782 EXPORT_SYMBOL(netdev_lower_get_next); 5783 5784 static struct net_device *netdev_next_lower_dev(struct net_device *dev, 5785 struct list_head **iter) 5786 { 5787 struct netdev_adjacent *lower; 5788 5789 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 5790 5791 if (&lower->list == &dev->adj_list.lower) 5792 return NULL; 5793 5794 *iter = &lower->list; 5795 5796 return lower->dev; 5797 } 5798 5799 int netdev_walk_all_lower_dev(struct net_device *dev, 5800 int (*fn)(struct net_device *dev, 5801 void *data), 5802 void *data) 5803 { 5804 struct net_device *ldev; 5805 struct list_head *iter; 5806 int ret; 5807 5808 for (iter = &dev->adj_list.lower, 5809 ldev = netdev_next_lower_dev(dev, &iter); 5810 ldev; 5811 ldev = netdev_next_lower_dev(dev, &iter)) { 5812 /* first is the lower device itself */ 5813 ret = fn(ldev, data); 5814 if (ret) 5815 return ret; 5816 5817 /* then look at all of its lower devices */ 5818 ret = netdev_walk_all_lower_dev(ldev, fn, data); 5819 if (ret) 5820 return ret; 5821 } 5822 5823 return 0; 5824 } 5825 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); 5826 5827 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, 5828 struct list_head **iter) 5829 { 5830 struct netdev_adjacent *lower; 5831 5832 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5833 if (&lower->list == &dev->adj_list.lower) 5834 return NULL; 5835 5836 *iter = &lower->list; 5837 5838 return lower->dev; 5839 } 5840 5841 int netdev_walk_all_lower_dev_rcu(struct net_device *dev, 5842 int (*fn)(struct net_device *dev, 5843 void *data), 5844 void *data) 5845 { 5846 struct net_device *ldev; 5847 struct list_head *iter; 5848 int ret; 5849 5850 for (iter = &dev->adj_list.lower, 5851 ldev = netdev_next_lower_dev_rcu(dev, &iter); 5852 ldev; 5853 ldev = netdev_next_lower_dev_rcu(dev, &iter)) { 5854 /* first is the lower device itself */ 5855 ret = fn(ldev, data); 5856 if (ret) 5857 return ret; 5858 5859 /* then look at all of its lower devices */ 5860 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data); 5861 if (ret) 5862 return ret; 5863 } 5864 5865 return 0; 5866 } 5867 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); 5868 5869 /** 5870 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5871 * lower neighbour list, RCU 5872 * variant 5873 * @dev: device 5874 * 5875 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5876 * list. The caller must hold RCU read lock. 5877 */ 5878 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5879 { 5880 struct netdev_adjacent *lower; 5881 5882 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5883 struct netdev_adjacent, list); 5884 if (lower) 5885 return lower->private; 5886 return NULL; 5887 } 5888 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5889 5890 /** 5891 * netdev_master_upper_dev_get_rcu - Get master upper device 5892 * @dev: device 5893 * 5894 * Find a master upper device and return pointer to it or NULL in case 5895 * it's not there. The caller must hold the RCU read lock. 5896 */ 5897 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5898 { 5899 struct netdev_adjacent *upper; 5900 5901 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5902 struct netdev_adjacent, list); 5903 if (upper && likely(upper->master)) 5904 return upper->dev; 5905 return NULL; 5906 } 5907 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5908 5909 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5910 struct net_device *adj_dev, 5911 struct list_head *dev_list) 5912 { 5913 char linkname[IFNAMSIZ+7]; 5914 5915 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5916 "upper_%s" : "lower_%s", adj_dev->name); 5917 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5918 linkname); 5919 } 5920 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5921 char *name, 5922 struct list_head *dev_list) 5923 { 5924 char linkname[IFNAMSIZ+7]; 5925 5926 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5927 "upper_%s" : "lower_%s", name); 5928 sysfs_remove_link(&(dev->dev.kobj), linkname); 5929 } 5930 5931 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5932 struct net_device *adj_dev, 5933 struct list_head *dev_list) 5934 { 5935 return (dev_list == &dev->adj_list.upper || 5936 dev_list == &dev->adj_list.lower) && 5937 net_eq(dev_net(dev), dev_net(adj_dev)); 5938 } 5939 5940 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5941 struct net_device *adj_dev, 5942 struct list_head *dev_list, 5943 void *private, bool master) 5944 { 5945 struct netdev_adjacent *adj; 5946 int ret; 5947 5948 adj = __netdev_find_adj(adj_dev, dev_list); 5949 5950 if (adj) { 5951 adj->ref_nr += 1; 5952 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", 5953 dev->name, adj_dev->name, adj->ref_nr); 5954 5955 return 0; 5956 } 5957 5958 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5959 if (!adj) 5960 return -ENOMEM; 5961 5962 adj->dev = adj_dev; 5963 adj->master = master; 5964 adj->ref_nr = 1; 5965 adj->private = private; 5966 dev_hold(adj_dev); 5967 5968 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", 5969 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); 5970 5971 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5972 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5973 if (ret) 5974 goto free_adj; 5975 } 5976 5977 /* Ensure that master link is always the first item in list. */ 5978 if (master) { 5979 ret = sysfs_create_link(&(dev->dev.kobj), 5980 &(adj_dev->dev.kobj), "master"); 5981 if (ret) 5982 goto remove_symlinks; 5983 5984 list_add_rcu(&adj->list, dev_list); 5985 } else { 5986 list_add_tail_rcu(&adj->list, dev_list); 5987 } 5988 5989 return 0; 5990 5991 remove_symlinks: 5992 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5993 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5994 free_adj: 5995 kfree(adj); 5996 dev_put(adj_dev); 5997 5998 return ret; 5999 } 6000 6001 static void __netdev_adjacent_dev_remove(struct net_device *dev, 6002 struct net_device *adj_dev, 6003 u16 ref_nr, 6004 struct list_head *dev_list) 6005 { 6006 struct netdev_adjacent *adj; 6007 6008 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", 6009 dev->name, adj_dev->name, ref_nr); 6010 6011 adj = __netdev_find_adj(adj_dev, dev_list); 6012 6013 if (!adj) { 6014 pr_err("Adjacency does not exist for device %s from %s\n", 6015 dev->name, adj_dev->name); 6016 WARN_ON(1); 6017 return; 6018 } 6019 6020 if (adj->ref_nr > ref_nr) { 6021 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", 6022 dev->name, adj_dev->name, ref_nr, 6023 adj->ref_nr - ref_nr); 6024 adj->ref_nr -= ref_nr; 6025 return; 6026 } 6027 6028 if (adj->master) 6029 sysfs_remove_link(&(dev->dev.kobj), "master"); 6030 6031 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 6032 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 6033 6034 list_del_rcu(&adj->list); 6035 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", 6036 adj_dev->name, dev->name, adj_dev->name); 6037 dev_put(adj_dev); 6038 kfree_rcu(adj, rcu); 6039 } 6040 6041 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 6042 struct net_device *upper_dev, 6043 struct list_head *up_list, 6044 struct list_head *down_list, 6045 void *private, bool master) 6046 { 6047 int ret; 6048 6049 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, 6050 private, master); 6051 if (ret) 6052 return ret; 6053 6054 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, 6055 private, false); 6056 if (ret) { 6057 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); 6058 return ret; 6059 } 6060 6061 return 0; 6062 } 6063 6064 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 6065 struct net_device *upper_dev, 6066 u16 ref_nr, 6067 struct list_head *up_list, 6068 struct list_head *down_list) 6069 { 6070 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 6071 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); 6072 } 6073 6074 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 6075 struct net_device *upper_dev, 6076 void *private, bool master) 6077 { 6078 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 6079 &dev->adj_list.upper, 6080 &upper_dev->adj_list.lower, 6081 private, master); 6082 } 6083 6084 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 6085 struct net_device *upper_dev) 6086 { 6087 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, 6088 &dev->adj_list.upper, 6089 &upper_dev->adj_list.lower); 6090 } 6091 6092 static int __netdev_upper_dev_link(struct net_device *dev, 6093 struct net_device *upper_dev, bool master, 6094 void *upper_priv, void *upper_info) 6095 { 6096 struct netdev_notifier_changeupper_info changeupper_info; 6097 int ret = 0; 6098 6099 ASSERT_RTNL(); 6100 6101 if (dev == upper_dev) 6102 return -EBUSY; 6103 6104 /* To prevent loops, check if dev is not upper device to upper_dev. */ 6105 if (netdev_has_upper_dev(upper_dev, dev)) 6106 return -EBUSY; 6107 6108 if (netdev_has_upper_dev(dev, upper_dev)) 6109 return -EEXIST; 6110 6111 if (master && netdev_master_upper_dev_get(dev)) 6112 return -EBUSY; 6113 6114 changeupper_info.upper_dev = upper_dev; 6115 changeupper_info.master = master; 6116 changeupper_info.linking = true; 6117 changeupper_info.upper_info = upper_info; 6118 6119 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 6120 &changeupper_info.info); 6121 ret = notifier_to_errno(ret); 6122 if (ret) 6123 return ret; 6124 6125 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, 6126 master); 6127 if (ret) 6128 return ret; 6129 6130 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 6131 &changeupper_info.info); 6132 ret = notifier_to_errno(ret); 6133 if (ret) 6134 goto rollback; 6135 6136 return 0; 6137 6138 rollback: 6139 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 6140 6141 return ret; 6142 } 6143 6144 /** 6145 * netdev_upper_dev_link - Add a link to the upper device 6146 * @dev: device 6147 * @upper_dev: new upper device 6148 * 6149 * Adds a link to device which is upper to this one. The caller must hold 6150 * the RTNL lock. On a failure a negative errno code is returned. 6151 * On success the reference counts are adjusted and the function 6152 * returns zero. 6153 */ 6154 int netdev_upper_dev_link(struct net_device *dev, 6155 struct net_device *upper_dev) 6156 { 6157 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); 6158 } 6159 EXPORT_SYMBOL(netdev_upper_dev_link); 6160 6161 /** 6162 * netdev_master_upper_dev_link - Add a master link to the upper device 6163 * @dev: device 6164 * @upper_dev: new upper device 6165 * @upper_priv: upper device private 6166 * @upper_info: upper info to be passed down via notifier 6167 * 6168 * Adds a link to device which is upper to this one. In this case, only 6169 * one master upper device can be linked, although other non-master devices 6170 * might be linked as well. The caller must hold the RTNL lock. 6171 * On a failure a negative errno code is returned. On success the reference 6172 * counts are adjusted and the function returns zero. 6173 */ 6174 int netdev_master_upper_dev_link(struct net_device *dev, 6175 struct net_device *upper_dev, 6176 void *upper_priv, void *upper_info) 6177 { 6178 return __netdev_upper_dev_link(dev, upper_dev, true, 6179 upper_priv, upper_info); 6180 } 6181 EXPORT_SYMBOL(netdev_master_upper_dev_link); 6182 6183 /** 6184 * netdev_upper_dev_unlink - Removes a link to upper device 6185 * @dev: device 6186 * @upper_dev: new upper device 6187 * 6188 * Removes a link to device which is upper to this one. The caller must hold 6189 * the RTNL lock. 6190 */ 6191 void netdev_upper_dev_unlink(struct net_device *dev, 6192 struct net_device *upper_dev) 6193 { 6194 struct netdev_notifier_changeupper_info changeupper_info; 6195 6196 ASSERT_RTNL(); 6197 6198 changeupper_info.upper_dev = upper_dev; 6199 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; 6200 changeupper_info.linking = false; 6201 6202 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 6203 &changeupper_info.info); 6204 6205 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 6206 6207 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 6208 &changeupper_info.info); 6209 } 6210 EXPORT_SYMBOL(netdev_upper_dev_unlink); 6211 6212 /** 6213 * netdev_bonding_info_change - Dispatch event about slave change 6214 * @dev: device 6215 * @bonding_info: info to dispatch 6216 * 6217 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 6218 * The caller must hold the RTNL lock. 6219 */ 6220 void netdev_bonding_info_change(struct net_device *dev, 6221 struct netdev_bonding_info *bonding_info) 6222 { 6223 struct netdev_notifier_bonding_info info; 6224 6225 memcpy(&info.bonding_info, bonding_info, 6226 sizeof(struct netdev_bonding_info)); 6227 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 6228 &info.info); 6229 } 6230 EXPORT_SYMBOL(netdev_bonding_info_change); 6231 6232 static void netdev_adjacent_add_links(struct net_device *dev) 6233 { 6234 struct netdev_adjacent *iter; 6235 6236 struct net *net = dev_net(dev); 6237 6238 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6239 if (!net_eq(net, dev_net(iter->dev))) 6240 continue; 6241 netdev_adjacent_sysfs_add(iter->dev, dev, 6242 &iter->dev->adj_list.lower); 6243 netdev_adjacent_sysfs_add(dev, iter->dev, 6244 &dev->adj_list.upper); 6245 } 6246 6247 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6248 if (!net_eq(net, dev_net(iter->dev))) 6249 continue; 6250 netdev_adjacent_sysfs_add(iter->dev, dev, 6251 &iter->dev->adj_list.upper); 6252 netdev_adjacent_sysfs_add(dev, iter->dev, 6253 &dev->adj_list.lower); 6254 } 6255 } 6256 6257 static void netdev_adjacent_del_links(struct net_device *dev) 6258 { 6259 struct netdev_adjacent *iter; 6260 6261 struct net *net = dev_net(dev); 6262 6263 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6264 if (!net_eq(net, dev_net(iter->dev))) 6265 continue; 6266 netdev_adjacent_sysfs_del(iter->dev, dev->name, 6267 &iter->dev->adj_list.lower); 6268 netdev_adjacent_sysfs_del(dev, iter->dev->name, 6269 &dev->adj_list.upper); 6270 } 6271 6272 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6273 if (!net_eq(net, dev_net(iter->dev))) 6274 continue; 6275 netdev_adjacent_sysfs_del(iter->dev, dev->name, 6276 &iter->dev->adj_list.upper); 6277 netdev_adjacent_sysfs_del(dev, iter->dev->name, 6278 &dev->adj_list.lower); 6279 } 6280 } 6281 6282 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 6283 { 6284 struct netdev_adjacent *iter; 6285 6286 struct net *net = dev_net(dev); 6287 6288 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6289 if (!net_eq(net, dev_net(iter->dev))) 6290 continue; 6291 netdev_adjacent_sysfs_del(iter->dev, oldname, 6292 &iter->dev->adj_list.lower); 6293 netdev_adjacent_sysfs_add(iter->dev, dev, 6294 &iter->dev->adj_list.lower); 6295 } 6296 6297 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6298 if (!net_eq(net, dev_net(iter->dev))) 6299 continue; 6300 netdev_adjacent_sysfs_del(iter->dev, oldname, 6301 &iter->dev->adj_list.upper); 6302 netdev_adjacent_sysfs_add(iter->dev, dev, 6303 &iter->dev->adj_list.upper); 6304 } 6305 } 6306 6307 void *netdev_lower_dev_get_private(struct net_device *dev, 6308 struct net_device *lower_dev) 6309 { 6310 struct netdev_adjacent *lower; 6311 6312 if (!lower_dev) 6313 return NULL; 6314 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); 6315 if (!lower) 6316 return NULL; 6317 6318 return lower->private; 6319 } 6320 EXPORT_SYMBOL(netdev_lower_dev_get_private); 6321 6322 6323 int dev_get_nest_level(struct net_device *dev) 6324 { 6325 struct net_device *lower = NULL; 6326 struct list_head *iter; 6327 int max_nest = -1; 6328 int nest; 6329 6330 ASSERT_RTNL(); 6331 6332 netdev_for_each_lower_dev(dev, lower, iter) { 6333 nest = dev_get_nest_level(lower); 6334 if (max_nest < nest) 6335 max_nest = nest; 6336 } 6337 6338 return max_nest + 1; 6339 } 6340 EXPORT_SYMBOL(dev_get_nest_level); 6341 6342 /** 6343 * netdev_lower_change - Dispatch event about lower device state change 6344 * @lower_dev: device 6345 * @lower_state_info: state to dispatch 6346 * 6347 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. 6348 * The caller must hold the RTNL lock. 6349 */ 6350 void netdev_lower_state_changed(struct net_device *lower_dev, 6351 void *lower_state_info) 6352 { 6353 struct netdev_notifier_changelowerstate_info changelowerstate_info; 6354 6355 ASSERT_RTNL(); 6356 changelowerstate_info.lower_state_info = lower_state_info; 6357 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, 6358 &changelowerstate_info.info); 6359 } 6360 EXPORT_SYMBOL(netdev_lower_state_changed); 6361 6362 static void dev_change_rx_flags(struct net_device *dev, int flags) 6363 { 6364 const struct net_device_ops *ops = dev->netdev_ops; 6365 6366 if (ops->ndo_change_rx_flags) 6367 ops->ndo_change_rx_flags(dev, flags); 6368 } 6369 6370 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 6371 { 6372 unsigned int old_flags = dev->flags; 6373 kuid_t uid; 6374 kgid_t gid; 6375 6376 ASSERT_RTNL(); 6377 6378 dev->flags |= IFF_PROMISC; 6379 dev->promiscuity += inc; 6380 if (dev->promiscuity == 0) { 6381 /* 6382 * Avoid overflow. 6383 * If inc causes overflow, untouch promisc and return error. 6384 */ 6385 if (inc < 0) 6386 dev->flags &= ~IFF_PROMISC; 6387 else { 6388 dev->promiscuity -= inc; 6389 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 6390 dev->name); 6391 return -EOVERFLOW; 6392 } 6393 } 6394 if (dev->flags != old_flags) { 6395 pr_info("device %s %s promiscuous mode\n", 6396 dev->name, 6397 dev->flags & IFF_PROMISC ? "entered" : "left"); 6398 if (audit_enabled) { 6399 current_uid_gid(&uid, &gid); 6400 audit_log(current->audit_context, GFP_ATOMIC, 6401 AUDIT_ANOM_PROMISCUOUS, 6402 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 6403 dev->name, (dev->flags & IFF_PROMISC), 6404 (old_flags & IFF_PROMISC), 6405 from_kuid(&init_user_ns, audit_get_loginuid(current)), 6406 from_kuid(&init_user_ns, uid), 6407 from_kgid(&init_user_ns, gid), 6408 audit_get_sessionid(current)); 6409 } 6410 6411 dev_change_rx_flags(dev, IFF_PROMISC); 6412 } 6413 if (notify) 6414 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 6415 return 0; 6416 } 6417 6418 /** 6419 * dev_set_promiscuity - update promiscuity count on a device 6420 * @dev: device 6421 * @inc: modifier 6422 * 6423 * Add or remove promiscuity from a device. While the count in the device 6424 * remains above zero the interface remains promiscuous. Once it hits zero 6425 * the device reverts back to normal filtering operation. A negative inc 6426 * value is used to drop promiscuity on the device. 6427 * Return 0 if successful or a negative errno code on error. 6428 */ 6429 int dev_set_promiscuity(struct net_device *dev, int inc) 6430 { 6431 unsigned int old_flags = dev->flags; 6432 int err; 6433 6434 err = __dev_set_promiscuity(dev, inc, true); 6435 if (err < 0) 6436 return err; 6437 if (dev->flags != old_flags) 6438 dev_set_rx_mode(dev); 6439 return err; 6440 } 6441 EXPORT_SYMBOL(dev_set_promiscuity); 6442 6443 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 6444 { 6445 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 6446 6447 ASSERT_RTNL(); 6448 6449 dev->flags |= IFF_ALLMULTI; 6450 dev->allmulti += inc; 6451 if (dev->allmulti == 0) { 6452 /* 6453 * Avoid overflow. 6454 * If inc causes overflow, untouch allmulti and return error. 6455 */ 6456 if (inc < 0) 6457 dev->flags &= ~IFF_ALLMULTI; 6458 else { 6459 dev->allmulti -= inc; 6460 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", 6461 dev->name); 6462 return -EOVERFLOW; 6463 } 6464 } 6465 if (dev->flags ^ old_flags) { 6466 dev_change_rx_flags(dev, IFF_ALLMULTI); 6467 dev_set_rx_mode(dev); 6468 if (notify) 6469 __dev_notify_flags(dev, old_flags, 6470 dev->gflags ^ old_gflags); 6471 } 6472 return 0; 6473 } 6474 6475 /** 6476 * dev_set_allmulti - update allmulti count on a device 6477 * @dev: device 6478 * @inc: modifier 6479 * 6480 * Add or remove reception of all multicast frames to a device. While the 6481 * count in the device remains above zero the interface remains listening 6482 * to all interfaces. Once it hits zero the device reverts back to normal 6483 * filtering operation. A negative @inc value is used to drop the counter 6484 * when releasing a resource needing all multicasts. 6485 * Return 0 if successful or a negative errno code on error. 6486 */ 6487 6488 int dev_set_allmulti(struct net_device *dev, int inc) 6489 { 6490 return __dev_set_allmulti(dev, inc, true); 6491 } 6492 EXPORT_SYMBOL(dev_set_allmulti); 6493 6494 /* 6495 * Upload unicast and multicast address lists to device and 6496 * configure RX filtering. When the device doesn't support unicast 6497 * filtering it is put in promiscuous mode while unicast addresses 6498 * are present. 6499 */ 6500 void __dev_set_rx_mode(struct net_device *dev) 6501 { 6502 const struct net_device_ops *ops = dev->netdev_ops; 6503 6504 /* dev_open will call this function so the list will stay sane. */ 6505 if (!(dev->flags&IFF_UP)) 6506 return; 6507 6508 if (!netif_device_present(dev)) 6509 return; 6510 6511 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 6512 /* Unicast addresses changes may only happen under the rtnl, 6513 * therefore calling __dev_set_promiscuity here is safe. 6514 */ 6515 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 6516 __dev_set_promiscuity(dev, 1, false); 6517 dev->uc_promisc = true; 6518 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 6519 __dev_set_promiscuity(dev, -1, false); 6520 dev->uc_promisc = false; 6521 } 6522 } 6523 6524 if (ops->ndo_set_rx_mode) 6525 ops->ndo_set_rx_mode(dev); 6526 } 6527 6528 void dev_set_rx_mode(struct net_device *dev) 6529 { 6530 netif_addr_lock_bh(dev); 6531 __dev_set_rx_mode(dev); 6532 netif_addr_unlock_bh(dev); 6533 } 6534 6535 /** 6536 * dev_get_flags - get flags reported to userspace 6537 * @dev: device 6538 * 6539 * Get the combination of flag bits exported through APIs to userspace. 6540 */ 6541 unsigned int dev_get_flags(const struct net_device *dev) 6542 { 6543 unsigned int flags; 6544 6545 flags = (dev->flags & ~(IFF_PROMISC | 6546 IFF_ALLMULTI | 6547 IFF_RUNNING | 6548 IFF_LOWER_UP | 6549 IFF_DORMANT)) | 6550 (dev->gflags & (IFF_PROMISC | 6551 IFF_ALLMULTI)); 6552 6553 if (netif_running(dev)) { 6554 if (netif_oper_up(dev)) 6555 flags |= IFF_RUNNING; 6556 if (netif_carrier_ok(dev)) 6557 flags |= IFF_LOWER_UP; 6558 if (netif_dormant(dev)) 6559 flags |= IFF_DORMANT; 6560 } 6561 6562 return flags; 6563 } 6564 EXPORT_SYMBOL(dev_get_flags); 6565 6566 int __dev_change_flags(struct net_device *dev, unsigned int flags) 6567 { 6568 unsigned int old_flags = dev->flags; 6569 int ret; 6570 6571 ASSERT_RTNL(); 6572 6573 /* 6574 * Set the flags on our device. 6575 */ 6576 6577 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 6578 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 6579 IFF_AUTOMEDIA)) | 6580 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 6581 IFF_ALLMULTI)); 6582 6583 /* 6584 * Load in the correct multicast list now the flags have changed. 6585 */ 6586 6587 if ((old_flags ^ flags) & IFF_MULTICAST) 6588 dev_change_rx_flags(dev, IFF_MULTICAST); 6589 6590 dev_set_rx_mode(dev); 6591 6592 /* 6593 * Have we downed the interface. We handle IFF_UP ourselves 6594 * according to user attempts to set it, rather than blindly 6595 * setting it. 6596 */ 6597 6598 ret = 0; 6599 if ((old_flags ^ flags) & IFF_UP) 6600 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 6601 6602 if ((flags ^ dev->gflags) & IFF_PROMISC) { 6603 int inc = (flags & IFF_PROMISC) ? 1 : -1; 6604 unsigned int old_flags = dev->flags; 6605 6606 dev->gflags ^= IFF_PROMISC; 6607 6608 if (__dev_set_promiscuity(dev, inc, false) >= 0) 6609 if (dev->flags != old_flags) 6610 dev_set_rx_mode(dev); 6611 } 6612 6613 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 6614 * is important. Some (broken) drivers set IFF_PROMISC, when 6615 * IFF_ALLMULTI is requested not asking us and not reporting. 6616 */ 6617 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 6618 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 6619 6620 dev->gflags ^= IFF_ALLMULTI; 6621 __dev_set_allmulti(dev, inc, false); 6622 } 6623 6624 return ret; 6625 } 6626 6627 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 6628 unsigned int gchanges) 6629 { 6630 unsigned int changes = dev->flags ^ old_flags; 6631 6632 if (gchanges) 6633 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 6634 6635 if (changes & IFF_UP) { 6636 if (dev->flags & IFF_UP) 6637 call_netdevice_notifiers(NETDEV_UP, dev); 6638 else 6639 call_netdevice_notifiers(NETDEV_DOWN, dev); 6640 } 6641 6642 if (dev->flags & IFF_UP && 6643 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 6644 struct netdev_notifier_change_info change_info; 6645 6646 change_info.flags_changed = changes; 6647 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 6648 &change_info.info); 6649 } 6650 } 6651 6652 /** 6653 * dev_change_flags - change device settings 6654 * @dev: device 6655 * @flags: device state flags 6656 * 6657 * Change settings on device based state flags. The flags are 6658 * in the userspace exported format. 6659 */ 6660 int dev_change_flags(struct net_device *dev, unsigned int flags) 6661 { 6662 int ret; 6663 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 6664 6665 ret = __dev_change_flags(dev, flags); 6666 if (ret < 0) 6667 return ret; 6668 6669 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 6670 __dev_notify_flags(dev, old_flags, changes); 6671 return ret; 6672 } 6673 EXPORT_SYMBOL(dev_change_flags); 6674 6675 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 6676 { 6677 const struct net_device_ops *ops = dev->netdev_ops; 6678 6679 if (ops->ndo_change_mtu) 6680 return ops->ndo_change_mtu(dev, new_mtu); 6681 6682 dev->mtu = new_mtu; 6683 return 0; 6684 } 6685 6686 /** 6687 * dev_set_mtu - Change maximum transfer unit 6688 * @dev: device 6689 * @new_mtu: new transfer unit 6690 * 6691 * Change the maximum transfer size of the network device. 6692 */ 6693 int dev_set_mtu(struct net_device *dev, int new_mtu) 6694 { 6695 int err, orig_mtu; 6696 6697 if (new_mtu == dev->mtu) 6698 return 0; 6699 6700 /* MTU must be positive, and in range */ 6701 if (new_mtu < 0 || new_mtu < dev->min_mtu) { 6702 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", 6703 dev->name, new_mtu, dev->min_mtu); 6704 return -EINVAL; 6705 } 6706 6707 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { 6708 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", 6709 dev->name, new_mtu, dev->max_mtu); 6710 return -EINVAL; 6711 } 6712 6713 if (!netif_device_present(dev)) 6714 return -ENODEV; 6715 6716 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 6717 err = notifier_to_errno(err); 6718 if (err) 6719 return err; 6720 6721 orig_mtu = dev->mtu; 6722 err = __dev_set_mtu(dev, new_mtu); 6723 6724 if (!err) { 6725 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6726 err = notifier_to_errno(err); 6727 if (err) { 6728 /* setting mtu back and notifying everyone again, 6729 * so that they have a chance to revert changes. 6730 */ 6731 __dev_set_mtu(dev, orig_mtu); 6732 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6733 } 6734 } 6735 return err; 6736 } 6737 EXPORT_SYMBOL(dev_set_mtu); 6738 6739 /** 6740 * dev_set_group - Change group this device belongs to 6741 * @dev: device 6742 * @new_group: group this device should belong to 6743 */ 6744 void dev_set_group(struct net_device *dev, int new_group) 6745 { 6746 dev->group = new_group; 6747 } 6748 EXPORT_SYMBOL(dev_set_group); 6749 6750 /** 6751 * dev_set_mac_address - Change Media Access Control Address 6752 * @dev: device 6753 * @sa: new address 6754 * 6755 * Change the hardware (MAC) address of the device 6756 */ 6757 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 6758 { 6759 const struct net_device_ops *ops = dev->netdev_ops; 6760 int err; 6761 6762 if (!ops->ndo_set_mac_address) 6763 return -EOPNOTSUPP; 6764 if (sa->sa_family != dev->type) 6765 return -EINVAL; 6766 if (!netif_device_present(dev)) 6767 return -ENODEV; 6768 err = ops->ndo_set_mac_address(dev, sa); 6769 if (err) 6770 return err; 6771 dev->addr_assign_type = NET_ADDR_SET; 6772 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 6773 add_device_randomness(dev->dev_addr, dev->addr_len); 6774 return 0; 6775 } 6776 EXPORT_SYMBOL(dev_set_mac_address); 6777 6778 /** 6779 * dev_change_carrier - Change device carrier 6780 * @dev: device 6781 * @new_carrier: new value 6782 * 6783 * Change device carrier 6784 */ 6785 int dev_change_carrier(struct net_device *dev, bool new_carrier) 6786 { 6787 const struct net_device_ops *ops = dev->netdev_ops; 6788 6789 if (!ops->ndo_change_carrier) 6790 return -EOPNOTSUPP; 6791 if (!netif_device_present(dev)) 6792 return -ENODEV; 6793 return ops->ndo_change_carrier(dev, new_carrier); 6794 } 6795 EXPORT_SYMBOL(dev_change_carrier); 6796 6797 /** 6798 * dev_get_phys_port_id - Get device physical port ID 6799 * @dev: device 6800 * @ppid: port ID 6801 * 6802 * Get device physical port ID 6803 */ 6804 int dev_get_phys_port_id(struct net_device *dev, 6805 struct netdev_phys_item_id *ppid) 6806 { 6807 const struct net_device_ops *ops = dev->netdev_ops; 6808 6809 if (!ops->ndo_get_phys_port_id) 6810 return -EOPNOTSUPP; 6811 return ops->ndo_get_phys_port_id(dev, ppid); 6812 } 6813 EXPORT_SYMBOL(dev_get_phys_port_id); 6814 6815 /** 6816 * dev_get_phys_port_name - Get device physical port name 6817 * @dev: device 6818 * @name: port name 6819 * @len: limit of bytes to copy to name 6820 * 6821 * Get device physical port name 6822 */ 6823 int dev_get_phys_port_name(struct net_device *dev, 6824 char *name, size_t len) 6825 { 6826 const struct net_device_ops *ops = dev->netdev_ops; 6827 6828 if (!ops->ndo_get_phys_port_name) 6829 return -EOPNOTSUPP; 6830 return ops->ndo_get_phys_port_name(dev, name, len); 6831 } 6832 EXPORT_SYMBOL(dev_get_phys_port_name); 6833 6834 /** 6835 * dev_change_proto_down - update protocol port state information 6836 * @dev: device 6837 * @proto_down: new value 6838 * 6839 * This info can be used by switch drivers to set the phys state of the 6840 * port. 6841 */ 6842 int dev_change_proto_down(struct net_device *dev, bool proto_down) 6843 { 6844 const struct net_device_ops *ops = dev->netdev_ops; 6845 6846 if (!ops->ndo_change_proto_down) 6847 return -EOPNOTSUPP; 6848 if (!netif_device_present(dev)) 6849 return -ENODEV; 6850 return ops->ndo_change_proto_down(dev, proto_down); 6851 } 6852 EXPORT_SYMBOL(dev_change_proto_down); 6853 6854 /** 6855 * dev_change_xdp_fd - set or clear a bpf program for a device rx path 6856 * @dev: device 6857 * @extack: netlink extended ack 6858 * @fd: new program fd or negative value to clear 6859 * @flags: xdp-related flags 6860 * 6861 * Set or clear a bpf program for a device 6862 */ 6863 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, 6864 int fd, u32 flags) 6865 { 6866 int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); 6867 const struct net_device_ops *ops = dev->netdev_ops; 6868 struct bpf_prog *prog = NULL; 6869 struct netdev_xdp xdp; 6870 int err; 6871 6872 ASSERT_RTNL(); 6873 6874 xdp_op = ops->ndo_xdp; 6875 if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) 6876 xdp_op = generic_xdp_install; 6877 6878 if (fd >= 0) { 6879 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { 6880 memset(&xdp, 0, sizeof(xdp)); 6881 xdp.command = XDP_QUERY_PROG; 6882 6883 err = xdp_op(dev, &xdp); 6884 if (err < 0) 6885 return err; 6886 if (xdp.prog_attached) 6887 return -EBUSY; 6888 } 6889 6890 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); 6891 if (IS_ERR(prog)) 6892 return PTR_ERR(prog); 6893 } 6894 6895 memset(&xdp, 0, sizeof(xdp)); 6896 xdp.command = XDP_SETUP_PROG; 6897 xdp.extack = extack; 6898 xdp.prog = prog; 6899 6900 err = xdp_op(dev, &xdp); 6901 if (err < 0 && prog) 6902 bpf_prog_put(prog); 6903 6904 return err; 6905 } 6906 6907 /** 6908 * dev_new_index - allocate an ifindex 6909 * @net: the applicable net namespace 6910 * 6911 * Returns a suitable unique value for a new device interface 6912 * number. The caller must hold the rtnl semaphore or the 6913 * dev_base_lock to be sure it remains unique. 6914 */ 6915 static int dev_new_index(struct net *net) 6916 { 6917 int ifindex = net->ifindex; 6918 6919 for (;;) { 6920 if (++ifindex <= 0) 6921 ifindex = 1; 6922 if (!__dev_get_by_index(net, ifindex)) 6923 return net->ifindex = ifindex; 6924 } 6925 } 6926 6927 /* Delayed registration/unregisteration */ 6928 static LIST_HEAD(net_todo_list); 6929 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 6930 6931 static void net_set_todo(struct net_device *dev) 6932 { 6933 list_add_tail(&dev->todo_list, &net_todo_list); 6934 dev_net(dev)->dev_unreg_count++; 6935 } 6936 6937 static void rollback_registered_many(struct list_head *head) 6938 { 6939 struct net_device *dev, *tmp; 6940 LIST_HEAD(close_head); 6941 6942 BUG_ON(dev_boot_phase); 6943 ASSERT_RTNL(); 6944 6945 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 6946 /* Some devices call without registering 6947 * for initialization unwind. Remove those 6948 * devices and proceed with the remaining. 6949 */ 6950 if (dev->reg_state == NETREG_UNINITIALIZED) { 6951 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 6952 dev->name, dev); 6953 6954 WARN_ON(1); 6955 list_del(&dev->unreg_list); 6956 continue; 6957 } 6958 dev->dismantle = true; 6959 BUG_ON(dev->reg_state != NETREG_REGISTERED); 6960 } 6961 6962 /* If device is running, close it first. */ 6963 list_for_each_entry(dev, head, unreg_list) 6964 list_add_tail(&dev->close_list, &close_head); 6965 dev_close_many(&close_head, true); 6966 6967 list_for_each_entry(dev, head, unreg_list) { 6968 /* And unlink it from device chain. */ 6969 unlist_netdevice(dev); 6970 6971 dev->reg_state = NETREG_UNREGISTERING; 6972 } 6973 flush_all_backlogs(); 6974 6975 synchronize_net(); 6976 6977 list_for_each_entry(dev, head, unreg_list) { 6978 struct sk_buff *skb = NULL; 6979 6980 /* Shutdown queueing discipline. */ 6981 dev_shutdown(dev); 6982 6983 6984 /* Notify protocols, that we are about to destroy 6985 * this device. They should clean all the things. 6986 */ 6987 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6988 6989 if (!dev->rtnl_link_ops || 6990 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6991 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6992 GFP_KERNEL); 6993 6994 /* 6995 * Flush the unicast and multicast chains 6996 */ 6997 dev_uc_flush(dev); 6998 dev_mc_flush(dev); 6999 7000 if (dev->netdev_ops->ndo_uninit) 7001 dev->netdev_ops->ndo_uninit(dev); 7002 7003 if (skb) 7004 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 7005 7006 /* Notifier chain MUST detach us all upper devices. */ 7007 WARN_ON(netdev_has_any_upper_dev(dev)); 7008 WARN_ON(netdev_has_any_lower_dev(dev)); 7009 7010 /* Remove entries from kobject tree */ 7011 netdev_unregister_kobject(dev); 7012 #ifdef CONFIG_XPS 7013 /* Remove XPS queueing entries */ 7014 netif_reset_xps_queues_gt(dev, 0); 7015 #endif 7016 } 7017 7018 synchronize_net(); 7019 7020 list_for_each_entry(dev, head, unreg_list) 7021 dev_put(dev); 7022 } 7023 7024 static void rollback_registered(struct net_device *dev) 7025 { 7026 LIST_HEAD(single); 7027 7028 list_add(&dev->unreg_list, &single); 7029 rollback_registered_many(&single); 7030 list_del(&single); 7031 } 7032 7033 static netdev_features_t netdev_sync_upper_features(struct net_device *lower, 7034 struct net_device *upper, netdev_features_t features) 7035 { 7036 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 7037 netdev_features_t feature; 7038 int feature_bit; 7039 7040 for_each_netdev_feature(&upper_disables, feature_bit) { 7041 feature = __NETIF_F_BIT(feature_bit); 7042 if (!(upper->wanted_features & feature) 7043 && (features & feature)) { 7044 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", 7045 &feature, upper->name); 7046 features &= ~feature; 7047 } 7048 } 7049 7050 return features; 7051 } 7052 7053 static void netdev_sync_lower_features(struct net_device *upper, 7054 struct net_device *lower, netdev_features_t features) 7055 { 7056 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 7057 netdev_features_t feature; 7058 int feature_bit; 7059 7060 for_each_netdev_feature(&upper_disables, feature_bit) { 7061 feature = __NETIF_F_BIT(feature_bit); 7062 if (!(features & feature) && (lower->features & feature)) { 7063 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", 7064 &feature, lower->name); 7065 lower->wanted_features &= ~feature; 7066 netdev_update_features(lower); 7067 7068 if (unlikely(lower->features & feature)) 7069 netdev_WARN(upper, "failed to disable %pNF on %s!\n", 7070 &feature, lower->name); 7071 } 7072 } 7073 } 7074 7075 static netdev_features_t netdev_fix_features(struct net_device *dev, 7076 netdev_features_t features) 7077 { 7078 /* Fix illegal checksum combinations */ 7079 if ((features & NETIF_F_HW_CSUM) && 7080 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 7081 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 7082 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 7083 } 7084 7085 /* TSO requires that SG is present as well. */ 7086 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 7087 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 7088 features &= ~NETIF_F_ALL_TSO; 7089 } 7090 7091 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 7092 !(features & NETIF_F_IP_CSUM)) { 7093 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 7094 features &= ~NETIF_F_TSO; 7095 features &= ~NETIF_F_TSO_ECN; 7096 } 7097 7098 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 7099 !(features & NETIF_F_IPV6_CSUM)) { 7100 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 7101 features &= ~NETIF_F_TSO6; 7102 } 7103 7104 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ 7105 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) 7106 features &= ~NETIF_F_TSO_MANGLEID; 7107 7108 /* TSO ECN requires that TSO is present as well. */ 7109 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 7110 features &= ~NETIF_F_TSO_ECN; 7111 7112 /* Software GSO depends on SG. */ 7113 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 7114 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 7115 features &= ~NETIF_F_GSO; 7116 } 7117 7118 /* UFO needs SG and checksumming */ 7119 if (features & NETIF_F_UFO) { 7120 /* maybe split UFO into V4 and V6? */ 7121 if (!(features & NETIF_F_HW_CSUM) && 7122 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != 7123 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { 7124 netdev_dbg(dev, 7125 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 7126 features &= ~NETIF_F_UFO; 7127 } 7128 7129 if (!(features & NETIF_F_SG)) { 7130 netdev_dbg(dev, 7131 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 7132 features &= ~NETIF_F_UFO; 7133 } 7134 } 7135 7136 /* GSO partial features require GSO partial be set */ 7137 if ((features & dev->gso_partial_features) && 7138 !(features & NETIF_F_GSO_PARTIAL)) { 7139 netdev_dbg(dev, 7140 "Dropping partially supported GSO features since no GSO partial.\n"); 7141 features &= ~dev->gso_partial_features; 7142 } 7143 7144 return features; 7145 } 7146 7147 int __netdev_update_features(struct net_device *dev) 7148 { 7149 struct net_device *upper, *lower; 7150 netdev_features_t features; 7151 struct list_head *iter; 7152 int err = -1; 7153 7154 ASSERT_RTNL(); 7155 7156 features = netdev_get_wanted_features(dev); 7157 7158 if (dev->netdev_ops->ndo_fix_features) 7159 features = dev->netdev_ops->ndo_fix_features(dev, features); 7160 7161 /* driver might be less strict about feature dependencies */ 7162 features = netdev_fix_features(dev, features); 7163 7164 /* some features can't be enabled if they're off an an upper device */ 7165 netdev_for_each_upper_dev_rcu(dev, upper, iter) 7166 features = netdev_sync_upper_features(dev, upper, features); 7167 7168 if (dev->features == features) 7169 goto sync_lower; 7170 7171 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 7172 &dev->features, &features); 7173 7174 if (dev->netdev_ops->ndo_set_features) 7175 err = dev->netdev_ops->ndo_set_features(dev, features); 7176 else 7177 err = 0; 7178 7179 if (unlikely(err < 0)) { 7180 netdev_err(dev, 7181 "set_features() failed (%d); wanted %pNF, left %pNF\n", 7182 err, &features, &dev->features); 7183 /* return non-0 since some features might have changed and 7184 * it's better to fire a spurious notification than miss it 7185 */ 7186 return -1; 7187 } 7188 7189 sync_lower: 7190 /* some features must be disabled on lower devices when disabled 7191 * on an upper device (think: bonding master or bridge) 7192 */ 7193 netdev_for_each_lower_dev(dev, lower, iter) 7194 netdev_sync_lower_features(dev, lower, features); 7195 7196 if (!err) 7197 dev->features = features; 7198 7199 return err < 0 ? 0 : 1; 7200 } 7201 7202 /** 7203 * netdev_update_features - recalculate device features 7204 * @dev: the device to check 7205 * 7206 * Recalculate dev->features set and send notifications if it 7207 * has changed. Should be called after driver or hardware dependent 7208 * conditions might have changed that influence the features. 7209 */ 7210 void netdev_update_features(struct net_device *dev) 7211 { 7212 if (__netdev_update_features(dev)) 7213 netdev_features_change(dev); 7214 } 7215 EXPORT_SYMBOL(netdev_update_features); 7216 7217 /** 7218 * netdev_change_features - recalculate device features 7219 * @dev: the device to check 7220 * 7221 * Recalculate dev->features set and send notifications even 7222 * if they have not changed. Should be called instead of 7223 * netdev_update_features() if also dev->vlan_features might 7224 * have changed to allow the changes to be propagated to stacked 7225 * VLAN devices. 7226 */ 7227 void netdev_change_features(struct net_device *dev) 7228 { 7229 __netdev_update_features(dev); 7230 netdev_features_change(dev); 7231 } 7232 EXPORT_SYMBOL(netdev_change_features); 7233 7234 /** 7235 * netif_stacked_transfer_operstate - transfer operstate 7236 * @rootdev: the root or lower level device to transfer state from 7237 * @dev: the device to transfer operstate to 7238 * 7239 * Transfer operational state from root to device. This is normally 7240 * called when a stacking relationship exists between the root 7241 * device and the device(a leaf device). 7242 */ 7243 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 7244 struct net_device *dev) 7245 { 7246 if (rootdev->operstate == IF_OPER_DORMANT) 7247 netif_dormant_on(dev); 7248 else 7249 netif_dormant_off(dev); 7250 7251 if (netif_carrier_ok(rootdev)) 7252 netif_carrier_on(dev); 7253 else 7254 netif_carrier_off(dev); 7255 } 7256 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 7257 7258 #ifdef CONFIG_SYSFS 7259 static int netif_alloc_rx_queues(struct net_device *dev) 7260 { 7261 unsigned int i, count = dev->num_rx_queues; 7262 struct netdev_rx_queue *rx; 7263 size_t sz = count * sizeof(*rx); 7264 7265 BUG_ON(count < 1); 7266 7267 rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); 7268 if (!rx) 7269 return -ENOMEM; 7270 7271 dev->_rx = rx; 7272 7273 for (i = 0; i < count; i++) 7274 rx[i].dev = dev; 7275 return 0; 7276 } 7277 #endif 7278 7279 static void netdev_init_one_queue(struct net_device *dev, 7280 struct netdev_queue *queue, void *_unused) 7281 { 7282 /* Initialize queue lock */ 7283 spin_lock_init(&queue->_xmit_lock); 7284 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 7285 queue->xmit_lock_owner = -1; 7286 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 7287 queue->dev = dev; 7288 #ifdef CONFIG_BQL 7289 dql_init(&queue->dql, HZ); 7290 #endif 7291 } 7292 7293 static void netif_free_tx_queues(struct net_device *dev) 7294 { 7295 kvfree(dev->_tx); 7296 } 7297 7298 static int netif_alloc_netdev_queues(struct net_device *dev) 7299 { 7300 unsigned int count = dev->num_tx_queues; 7301 struct netdev_queue *tx; 7302 size_t sz = count * sizeof(*tx); 7303 7304 if (count < 1 || count > 0xffff) 7305 return -EINVAL; 7306 7307 tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); 7308 if (!tx) 7309 return -ENOMEM; 7310 7311 dev->_tx = tx; 7312 7313 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 7314 spin_lock_init(&dev->tx_global_lock); 7315 7316 return 0; 7317 } 7318 7319 void netif_tx_stop_all_queues(struct net_device *dev) 7320 { 7321 unsigned int i; 7322 7323 for (i = 0; i < dev->num_tx_queues; i++) { 7324 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 7325 7326 netif_tx_stop_queue(txq); 7327 } 7328 } 7329 EXPORT_SYMBOL(netif_tx_stop_all_queues); 7330 7331 /** 7332 * register_netdevice - register a network device 7333 * @dev: device to register 7334 * 7335 * Take a completed network device structure and add it to the kernel 7336 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 7337 * chain. 0 is returned on success. A negative errno code is returned 7338 * on a failure to set up the device, or if the name is a duplicate. 7339 * 7340 * Callers must hold the rtnl semaphore. You may want 7341 * register_netdev() instead of this. 7342 * 7343 * BUGS: 7344 * The locking appears insufficient to guarantee two parallel registers 7345 * will not get the same name. 7346 */ 7347 7348 int register_netdevice(struct net_device *dev) 7349 { 7350 int ret; 7351 struct net *net = dev_net(dev); 7352 7353 BUG_ON(dev_boot_phase); 7354 ASSERT_RTNL(); 7355 7356 might_sleep(); 7357 7358 /* When net_device's are persistent, this will be fatal. */ 7359 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 7360 BUG_ON(!net); 7361 7362 spin_lock_init(&dev->addr_list_lock); 7363 netdev_set_addr_lockdep_class(dev); 7364 7365 ret = dev_get_valid_name(net, dev, dev->name); 7366 if (ret < 0) 7367 goto out; 7368 7369 /* Init, if this function is available */ 7370 if (dev->netdev_ops->ndo_init) { 7371 ret = dev->netdev_ops->ndo_init(dev); 7372 if (ret) { 7373 if (ret > 0) 7374 ret = -EIO; 7375 goto out; 7376 } 7377 } 7378 7379 if (((dev->hw_features | dev->features) & 7380 NETIF_F_HW_VLAN_CTAG_FILTER) && 7381 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 7382 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 7383 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 7384 ret = -EINVAL; 7385 goto err_uninit; 7386 } 7387 7388 ret = -EBUSY; 7389 if (!dev->ifindex) 7390 dev->ifindex = dev_new_index(net); 7391 else if (__dev_get_by_index(net, dev->ifindex)) 7392 goto err_uninit; 7393 7394 /* Transfer changeable features to wanted_features and enable 7395 * software offloads (GSO and GRO). 7396 */ 7397 dev->hw_features |= NETIF_F_SOFT_FEATURES; 7398 dev->features |= NETIF_F_SOFT_FEATURES; 7399 dev->wanted_features = dev->features & dev->hw_features; 7400 7401 if (!(dev->flags & IFF_LOOPBACK)) 7402 dev->hw_features |= NETIF_F_NOCACHE_COPY; 7403 7404 /* If IPv4 TCP segmentation offload is supported we should also 7405 * allow the device to enable segmenting the frame with the option 7406 * of ignoring a static IP ID value. This doesn't enable the 7407 * feature itself but allows the user to enable it later. 7408 */ 7409 if (dev->hw_features & NETIF_F_TSO) 7410 dev->hw_features |= NETIF_F_TSO_MANGLEID; 7411 if (dev->vlan_features & NETIF_F_TSO) 7412 dev->vlan_features |= NETIF_F_TSO_MANGLEID; 7413 if (dev->mpls_features & NETIF_F_TSO) 7414 dev->mpls_features |= NETIF_F_TSO_MANGLEID; 7415 if (dev->hw_enc_features & NETIF_F_TSO) 7416 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; 7417 7418 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 7419 */ 7420 dev->vlan_features |= NETIF_F_HIGHDMA; 7421 7422 /* Make NETIF_F_SG inheritable to tunnel devices. 7423 */ 7424 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; 7425 7426 /* Make NETIF_F_SG inheritable to MPLS. 7427 */ 7428 dev->mpls_features |= NETIF_F_SG; 7429 7430 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 7431 ret = notifier_to_errno(ret); 7432 if (ret) 7433 goto err_uninit; 7434 7435 ret = netdev_register_kobject(dev); 7436 if (ret) 7437 goto err_uninit; 7438 dev->reg_state = NETREG_REGISTERED; 7439 7440 __netdev_update_features(dev); 7441 7442 /* 7443 * Default initial state at registry is that the 7444 * device is present. 7445 */ 7446 7447 set_bit(__LINK_STATE_PRESENT, &dev->state); 7448 7449 linkwatch_init_dev(dev); 7450 7451 dev_init_scheduler(dev); 7452 dev_hold(dev); 7453 list_netdevice(dev); 7454 add_device_randomness(dev->dev_addr, dev->addr_len); 7455 7456 /* If the device has permanent device address, driver should 7457 * set dev_addr and also addr_assign_type should be set to 7458 * NET_ADDR_PERM (default value). 7459 */ 7460 if (dev->addr_assign_type == NET_ADDR_PERM) 7461 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 7462 7463 /* Notify protocols, that a new device appeared. */ 7464 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 7465 ret = notifier_to_errno(ret); 7466 if (ret) { 7467 rollback_registered(dev); 7468 dev->reg_state = NETREG_UNREGISTERED; 7469 } 7470 /* 7471 * Prevent userspace races by waiting until the network 7472 * device is fully setup before sending notifications. 7473 */ 7474 if (!dev->rtnl_link_ops || 7475 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 7476 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7477 7478 out: 7479 return ret; 7480 7481 err_uninit: 7482 if (dev->netdev_ops->ndo_uninit) 7483 dev->netdev_ops->ndo_uninit(dev); 7484 goto out; 7485 } 7486 EXPORT_SYMBOL(register_netdevice); 7487 7488 /** 7489 * init_dummy_netdev - init a dummy network device for NAPI 7490 * @dev: device to init 7491 * 7492 * This takes a network device structure and initialize the minimum 7493 * amount of fields so it can be used to schedule NAPI polls without 7494 * registering a full blown interface. This is to be used by drivers 7495 * that need to tie several hardware interfaces to a single NAPI 7496 * poll scheduler due to HW limitations. 7497 */ 7498 int init_dummy_netdev(struct net_device *dev) 7499 { 7500 /* Clear everything. Note we don't initialize spinlocks 7501 * are they aren't supposed to be taken by any of the 7502 * NAPI code and this dummy netdev is supposed to be 7503 * only ever used for NAPI polls 7504 */ 7505 memset(dev, 0, sizeof(struct net_device)); 7506 7507 /* make sure we BUG if trying to hit standard 7508 * register/unregister code path 7509 */ 7510 dev->reg_state = NETREG_DUMMY; 7511 7512 /* NAPI wants this */ 7513 INIT_LIST_HEAD(&dev->napi_list); 7514 7515 /* a dummy interface is started by default */ 7516 set_bit(__LINK_STATE_PRESENT, &dev->state); 7517 set_bit(__LINK_STATE_START, &dev->state); 7518 7519 /* Note : We dont allocate pcpu_refcnt for dummy devices, 7520 * because users of this 'device' dont need to change 7521 * its refcount. 7522 */ 7523 7524 return 0; 7525 } 7526 EXPORT_SYMBOL_GPL(init_dummy_netdev); 7527 7528 7529 /** 7530 * register_netdev - register a network device 7531 * @dev: device to register 7532 * 7533 * Take a completed network device structure and add it to the kernel 7534 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 7535 * chain. 0 is returned on success. A negative errno code is returned 7536 * on a failure to set up the device, or if the name is a duplicate. 7537 * 7538 * This is a wrapper around register_netdevice that takes the rtnl semaphore 7539 * and expands the device name if you passed a format string to 7540 * alloc_netdev. 7541 */ 7542 int register_netdev(struct net_device *dev) 7543 { 7544 int err; 7545 7546 rtnl_lock(); 7547 err = register_netdevice(dev); 7548 rtnl_unlock(); 7549 return err; 7550 } 7551 EXPORT_SYMBOL(register_netdev); 7552 7553 int netdev_refcnt_read(const struct net_device *dev) 7554 { 7555 int i, refcnt = 0; 7556 7557 for_each_possible_cpu(i) 7558 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 7559 return refcnt; 7560 } 7561 EXPORT_SYMBOL(netdev_refcnt_read); 7562 7563 /** 7564 * netdev_wait_allrefs - wait until all references are gone. 7565 * @dev: target net_device 7566 * 7567 * This is called when unregistering network devices. 7568 * 7569 * Any protocol or device that holds a reference should register 7570 * for netdevice notification, and cleanup and put back the 7571 * reference if they receive an UNREGISTER event. 7572 * We can get stuck here if buggy protocols don't correctly 7573 * call dev_put. 7574 */ 7575 static void netdev_wait_allrefs(struct net_device *dev) 7576 { 7577 unsigned long rebroadcast_time, warning_time; 7578 int refcnt; 7579 7580 linkwatch_forget_dev(dev); 7581 7582 rebroadcast_time = warning_time = jiffies; 7583 refcnt = netdev_refcnt_read(dev); 7584 7585 while (refcnt != 0) { 7586 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 7587 rtnl_lock(); 7588 7589 /* Rebroadcast unregister notification */ 7590 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7591 7592 __rtnl_unlock(); 7593 rcu_barrier(); 7594 rtnl_lock(); 7595 7596 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7597 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 7598 &dev->state)) { 7599 /* We must not have linkwatch events 7600 * pending on unregister. If this 7601 * happens, we simply run the queue 7602 * unscheduled, resulting in a noop 7603 * for this device. 7604 */ 7605 linkwatch_run_queue(); 7606 } 7607 7608 __rtnl_unlock(); 7609 7610 rebroadcast_time = jiffies; 7611 } 7612 7613 msleep(250); 7614 7615 refcnt = netdev_refcnt_read(dev); 7616 7617 if (time_after(jiffies, warning_time + 10 * HZ)) { 7618 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 7619 dev->name, refcnt); 7620 warning_time = jiffies; 7621 } 7622 } 7623 } 7624 7625 /* The sequence is: 7626 * 7627 * rtnl_lock(); 7628 * ... 7629 * register_netdevice(x1); 7630 * register_netdevice(x2); 7631 * ... 7632 * unregister_netdevice(y1); 7633 * unregister_netdevice(y2); 7634 * ... 7635 * rtnl_unlock(); 7636 * free_netdev(y1); 7637 * free_netdev(y2); 7638 * 7639 * We are invoked by rtnl_unlock(). 7640 * This allows us to deal with problems: 7641 * 1) We can delete sysfs objects which invoke hotplug 7642 * without deadlocking with linkwatch via keventd. 7643 * 2) Since we run with the RTNL semaphore not held, we can sleep 7644 * safely in order to wait for the netdev refcnt to drop to zero. 7645 * 7646 * We must not return until all unregister events added during 7647 * the interval the lock was held have been completed. 7648 */ 7649 void netdev_run_todo(void) 7650 { 7651 struct list_head list; 7652 7653 /* Snapshot list, allow later requests */ 7654 list_replace_init(&net_todo_list, &list); 7655 7656 __rtnl_unlock(); 7657 7658 7659 /* Wait for rcu callbacks to finish before next phase */ 7660 if (!list_empty(&list)) 7661 rcu_barrier(); 7662 7663 while (!list_empty(&list)) { 7664 struct net_device *dev 7665 = list_first_entry(&list, struct net_device, todo_list); 7666 list_del(&dev->todo_list); 7667 7668 rtnl_lock(); 7669 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7670 __rtnl_unlock(); 7671 7672 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 7673 pr_err("network todo '%s' but state %d\n", 7674 dev->name, dev->reg_state); 7675 dump_stack(); 7676 continue; 7677 } 7678 7679 dev->reg_state = NETREG_UNREGISTERED; 7680 7681 netdev_wait_allrefs(dev); 7682 7683 /* paranoia */ 7684 BUG_ON(netdev_refcnt_read(dev)); 7685 BUG_ON(!list_empty(&dev->ptype_all)); 7686 BUG_ON(!list_empty(&dev->ptype_specific)); 7687 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 7688 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 7689 WARN_ON(dev->dn_ptr); 7690 7691 if (dev->destructor) 7692 dev->destructor(dev); 7693 7694 /* Report a network device has been unregistered */ 7695 rtnl_lock(); 7696 dev_net(dev)->dev_unreg_count--; 7697 __rtnl_unlock(); 7698 wake_up(&netdev_unregistering_wq); 7699 7700 /* Free network device */ 7701 kobject_put(&dev->dev.kobj); 7702 } 7703 } 7704 7705 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has 7706 * all the same fields in the same order as net_device_stats, with only 7707 * the type differing, but rtnl_link_stats64 may have additional fields 7708 * at the end for newer counters. 7709 */ 7710 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 7711 const struct net_device_stats *netdev_stats) 7712 { 7713 #if BITS_PER_LONG == 64 7714 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); 7715 memcpy(stats64, netdev_stats, sizeof(*stats64)); 7716 /* zero out counters that only exist in rtnl_link_stats64 */ 7717 memset((char *)stats64 + sizeof(*netdev_stats), 0, 7718 sizeof(*stats64) - sizeof(*netdev_stats)); 7719 #else 7720 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); 7721 const unsigned long *src = (const unsigned long *)netdev_stats; 7722 u64 *dst = (u64 *)stats64; 7723 7724 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); 7725 for (i = 0; i < n; i++) 7726 dst[i] = src[i]; 7727 /* zero out counters that only exist in rtnl_link_stats64 */ 7728 memset((char *)stats64 + n * sizeof(u64), 0, 7729 sizeof(*stats64) - n * sizeof(u64)); 7730 #endif 7731 } 7732 EXPORT_SYMBOL(netdev_stats_to_stats64); 7733 7734 /** 7735 * dev_get_stats - get network device statistics 7736 * @dev: device to get statistics from 7737 * @storage: place to store stats 7738 * 7739 * Get network statistics from device. Return @storage. 7740 * The device driver may provide its own method by setting 7741 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 7742 * otherwise the internal statistics structure is used. 7743 */ 7744 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 7745 struct rtnl_link_stats64 *storage) 7746 { 7747 const struct net_device_ops *ops = dev->netdev_ops; 7748 7749 if (ops->ndo_get_stats64) { 7750 memset(storage, 0, sizeof(*storage)); 7751 ops->ndo_get_stats64(dev, storage); 7752 } else if (ops->ndo_get_stats) { 7753 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 7754 } else { 7755 netdev_stats_to_stats64(storage, &dev->stats); 7756 } 7757 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 7758 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 7759 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); 7760 return storage; 7761 } 7762 EXPORT_SYMBOL(dev_get_stats); 7763 7764 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 7765 { 7766 struct netdev_queue *queue = dev_ingress_queue(dev); 7767 7768 #ifdef CONFIG_NET_CLS_ACT 7769 if (queue) 7770 return queue; 7771 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 7772 if (!queue) 7773 return NULL; 7774 netdev_init_one_queue(dev, queue, NULL); 7775 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 7776 queue->qdisc_sleeping = &noop_qdisc; 7777 rcu_assign_pointer(dev->ingress_queue, queue); 7778 #endif 7779 return queue; 7780 } 7781 7782 static const struct ethtool_ops default_ethtool_ops; 7783 7784 void netdev_set_default_ethtool_ops(struct net_device *dev, 7785 const struct ethtool_ops *ops) 7786 { 7787 if (dev->ethtool_ops == &default_ethtool_ops) 7788 dev->ethtool_ops = ops; 7789 } 7790 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 7791 7792 void netdev_freemem(struct net_device *dev) 7793 { 7794 char *addr = (char *)dev - dev->padded; 7795 7796 kvfree(addr); 7797 } 7798 7799 /** 7800 * alloc_netdev_mqs - allocate network device 7801 * @sizeof_priv: size of private data to allocate space for 7802 * @name: device name format string 7803 * @name_assign_type: origin of device name 7804 * @setup: callback to initialize device 7805 * @txqs: the number of TX subqueues to allocate 7806 * @rxqs: the number of RX subqueues to allocate 7807 * 7808 * Allocates a struct net_device with private data area for driver use 7809 * and performs basic initialization. Also allocates subqueue structs 7810 * for each queue on the device. 7811 */ 7812 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 7813 unsigned char name_assign_type, 7814 void (*setup)(struct net_device *), 7815 unsigned int txqs, unsigned int rxqs) 7816 { 7817 struct net_device *dev; 7818 size_t alloc_size; 7819 struct net_device *p; 7820 7821 BUG_ON(strlen(name) >= sizeof(dev->name)); 7822 7823 if (txqs < 1) { 7824 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 7825 return NULL; 7826 } 7827 7828 #ifdef CONFIG_SYSFS 7829 if (rxqs < 1) { 7830 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 7831 return NULL; 7832 } 7833 #endif 7834 7835 alloc_size = sizeof(struct net_device); 7836 if (sizeof_priv) { 7837 /* ensure 32-byte alignment of private area */ 7838 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 7839 alloc_size += sizeof_priv; 7840 } 7841 /* ensure 32-byte alignment of whole construct */ 7842 alloc_size += NETDEV_ALIGN - 1; 7843 7844 p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); 7845 if (!p) 7846 return NULL; 7847 7848 dev = PTR_ALIGN(p, NETDEV_ALIGN); 7849 dev->padded = (char *)dev - (char *)p; 7850 7851 dev->pcpu_refcnt = alloc_percpu(int); 7852 if (!dev->pcpu_refcnt) 7853 goto free_dev; 7854 7855 if (dev_addr_init(dev)) 7856 goto free_pcpu; 7857 7858 dev_mc_init(dev); 7859 dev_uc_init(dev); 7860 7861 dev_net_set(dev, &init_net); 7862 7863 dev->gso_max_size = GSO_MAX_SIZE; 7864 dev->gso_max_segs = GSO_MAX_SEGS; 7865 7866 INIT_LIST_HEAD(&dev->napi_list); 7867 INIT_LIST_HEAD(&dev->unreg_list); 7868 INIT_LIST_HEAD(&dev->close_list); 7869 INIT_LIST_HEAD(&dev->link_watch_list); 7870 INIT_LIST_HEAD(&dev->adj_list.upper); 7871 INIT_LIST_HEAD(&dev->adj_list.lower); 7872 INIT_LIST_HEAD(&dev->ptype_all); 7873 INIT_LIST_HEAD(&dev->ptype_specific); 7874 #ifdef CONFIG_NET_SCHED 7875 hash_init(dev->qdisc_hash); 7876 #endif 7877 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 7878 setup(dev); 7879 7880 if (!dev->tx_queue_len) { 7881 dev->priv_flags |= IFF_NO_QUEUE; 7882 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; 7883 } 7884 7885 dev->num_tx_queues = txqs; 7886 dev->real_num_tx_queues = txqs; 7887 if (netif_alloc_netdev_queues(dev)) 7888 goto free_all; 7889 7890 #ifdef CONFIG_SYSFS 7891 dev->num_rx_queues = rxqs; 7892 dev->real_num_rx_queues = rxqs; 7893 if (netif_alloc_rx_queues(dev)) 7894 goto free_all; 7895 #endif 7896 7897 strcpy(dev->name, name); 7898 dev->name_assign_type = name_assign_type; 7899 dev->group = INIT_NETDEV_GROUP; 7900 if (!dev->ethtool_ops) 7901 dev->ethtool_ops = &default_ethtool_ops; 7902 7903 nf_hook_ingress_init(dev); 7904 7905 return dev; 7906 7907 free_all: 7908 free_netdev(dev); 7909 return NULL; 7910 7911 free_pcpu: 7912 free_percpu(dev->pcpu_refcnt); 7913 free_dev: 7914 netdev_freemem(dev); 7915 return NULL; 7916 } 7917 EXPORT_SYMBOL(alloc_netdev_mqs); 7918 7919 /** 7920 * free_netdev - free network device 7921 * @dev: device 7922 * 7923 * This function does the last stage of destroying an allocated device 7924 * interface. The reference to the device object is released. If this 7925 * is the last reference then it will be freed.Must be called in process 7926 * context. 7927 */ 7928 void free_netdev(struct net_device *dev) 7929 { 7930 struct napi_struct *p, *n; 7931 struct bpf_prog *prog; 7932 7933 might_sleep(); 7934 netif_free_tx_queues(dev); 7935 #ifdef CONFIG_SYSFS 7936 kvfree(dev->_rx); 7937 #endif 7938 7939 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 7940 7941 /* Flush device addresses */ 7942 dev_addr_flush(dev); 7943 7944 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 7945 netif_napi_del(p); 7946 7947 free_percpu(dev->pcpu_refcnt); 7948 dev->pcpu_refcnt = NULL; 7949 7950 prog = rcu_dereference_protected(dev->xdp_prog, 1); 7951 if (prog) { 7952 bpf_prog_put(prog); 7953 static_key_slow_dec(&generic_xdp_needed); 7954 } 7955 7956 /* Compatibility with error handling in drivers */ 7957 if (dev->reg_state == NETREG_UNINITIALIZED) { 7958 netdev_freemem(dev); 7959 return; 7960 } 7961 7962 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 7963 dev->reg_state = NETREG_RELEASED; 7964 7965 /* will free via device release */ 7966 put_device(&dev->dev); 7967 } 7968 EXPORT_SYMBOL(free_netdev); 7969 7970 /** 7971 * synchronize_net - Synchronize with packet receive processing 7972 * 7973 * Wait for packets currently being received to be done. 7974 * Does not block later packets from starting. 7975 */ 7976 void synchronize_net(void) 7977 { 7978 might_sleep(); 7979 if (rtnl_is_locked()) 7980 synchronize_rcu_expedited(); 7981 else 7982 synchronize_rcu(); 7983 } 7984 EXPORT_SYMBOL(synchronize_net); 7985 7986 /** 7987 * unregister_netdevice_queue - remove device from the kernel 7988 * @dev: device 7989 * @head: list 7990 * 7991 * This function shuts down a device interface and removes it 7992 * from the kernel tables. 7993 * If head not NULL, device is queued to be unregistered later. 7994 * 7995 * Callers must hold the rtnl semaphore. You may want 7996 * unregister_netdev() instead of this. 7997 */ 7998 7999 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 8000 { 8001 ASSERT_RTNL(); 8002 8003 if (head) { 8004 list_move_tail(&dev->unreg_list, head); 8005 } else { 8006 rollback_registered(dev); 8007 /* Finish processing unregister after unlock */ 8008 net_set_todo(dev); 8009 } 8010 } 8011 EXPORT_SYMBOL(unregister_netdevice_queue); 8012 8013 /** 8014 * unregister_netdevice_many - unregister many devices 8015 * @head: list of devices 8016 * 8017 * Note: As most callers use a stack allocated list_head, 8018 * we force a list_del() to make sure stack wont be corrupted later. 8019 */ 8020 void unregister_netdevice_many(struct list_head *head) 8021 { 8022 struct net_device *dev; 8023 8024 if (!list_empty(head)) { 8025 rollback_registered_many(head); 8026 list_for_each_entry(dev, head, unreg_list) 8027 net_set_todo(dev); 8028 list_del(head); 8029 } 8030 } 8031 EXPORT_SYMBOL(unregister_netdevice_many); 8032 8033 /** 8034 * unregister_netdev - remove device from the kernel 8035 * @dev: device 8036 * 8037 * This function shuts down a device interface and removes it 8038 * from the kernel tables. 8039 * 8040 * This is just a wrapper for unregister_netdevice that takes 8041 * the rtnl semaphore. In general you want to use this and not 8042 * unregister_netdevice. 8043 */ 8044 void unregister_netdev(struct net_device *dev) 8045 { 8046 rtnl_lock(); 8047 unregister_netdevice(dev); 8048 rtnl_unlock(); 8049 } 8050 EXPORT_SYMBOL(unregister_netdev); 8051 8052 /** 8053 * dev_change_net_namespace - move device to different nethost namespace 8054 * @dev: device 8055 * @net: network namespace 8056 * @pat: If not NULL name pattern to try if the current device name 8057 * is already taken in the destination network namespace. 8058 * 8059 * This function shuts down a device interface and moves it 8060 * to a new network namespace. On success 0 is returned, on 8061 * a failure a netagive errno code is returned. 8062 * 8063 * Callers must hold the rtnl semaphore. 8064 */ 8065 8066 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 8067 { 8068 int err; 8069 8070 ASSERT_RTNL(); 8071 8072 /* Don't allow namespace local devices to be moved. */ 8073 err = -EINVAL; 8074 if (dev->features & NETIF_F_NETNS_LOCAL) 8075 goto out; 8076 8077 /* Ensure the device has been registrered */ 8078 if (dev->reg_state != NETREG_REGISTERED) 8079 goto out; 8080 8081 /* Get out if there is nothing todo */ 8082 err = 0; 8083 if (net_eq(dev_net(dev), net)) 8084 goto out; 8085 8086 /* Pick the destination device name, and ensure 8087 * we can use it in the destination network namespace. 8088 */ 8089 err = -EEXIST; 8090 if (__dev_get_by_name(net, dev->name)) { 8091 /* We get here if we can't use the current device name */ 8092 if (!pat) 8093 goto out; 8094 if (dev_get_valid_name(net, dev, pat) < 0) 8095 goto out; 8096 } 8097 8098 /* 8099 * And now a mini version of register_netdevice unregister_netdevice. 8100 */ 8101 8102 /* If device is running close it first. */ 8103 dev_close(dev); 8104 8105 /* And unlink it from device chain */ 8106 err = -ENODEV; 8107 unlist_netdevice(dev); 8108 8109 synchronize_net(); 8110 8111 /* Shutdown queueing discipline. */ 8112 dev_shutdown(dev); 8113 8114 /* Notify protocols, that we are about to destroy 8115 * this device. They should clean all the things. 8116 * 8117 * Note that dev->reg_state stays at NETREG_REGISTERED. 8118 * This is wanted because this way 8021q and macvlan know 8119 * the device is just moving and can keep their slaves up. 8120 */ 8121 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 8122 rcu_barrier(); 8123 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 8124 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 8125 8126 /* 8127 * Flush the unicast and multicast chains 8128 */ 8129 dev_uc_flush(dev); 8130 dev_mc_flush(dev); 8131 8132 /* Send a netdev-removed uevent to the old namespace */ 8133 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 8134 netdev_adjacent_del_links(dev); 8135 8136 /* Actually switch the network namespace */ 8137 dev_net_set(dev, net); 8138 8139 /* If there is an ifindex conflict assign a new one */ 8140 if (__dev_get_by_index(net, dev->ifindex)) 8141 dev->ifindex = dev_new_index(net); 8142 8143 /* Send a netdev-add uevent to the new namespace */ 8144 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 8145 netdev_adjacent_add_links(dev); 8146 8147 /* Fixup kobjects */ 8148 err = device_rename(&dev->dev, dev->name); 8149 WARN_ON(err); 8150 8151 /* Add the device back in the hashes */ 8152 list_netdevice(dev); 8153 8154 /* Notify protocols, that a new device appeared. */ 8155 call_netdevice_notifiers(NETDEV_REGISTER, dev); 8156 8157 /* 8158 * Prevent userspace races by waiting until the network 8159 * device is fully setup before sending notifications. 8160 */ 8161 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 8162 8163 synchronize_net(); 8164 err = 0; 8165 out: 8166 return err; 8167 } 8168 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 8169 8170 static int dev_cpu_dead(unsigned int oldcpu) 8171 { 8172 struct sk_buff **list_skb; 8173 struct sk_buff *skb; 8174 unsigned int cpu; 8175 struct softnet_data *sd, *oldsd; 8176 8177 local_irq_disable(); 8178 cpu = smp_processor_id(); 8179 sd = &per_cpu(softnet_data, cpu); 8180 oldsd = &per_cpu(softnet_data, oldcpu); 8181 8182 /* Find end of our completion_queue. */ 8183 list_skb = &sd->completion_queue; 8184 while (*list_skb) 8185 list_skb = &(*list_skb)->next; 8186 /* Append completion queue from offline CPU. */ 8187 *list_skb = oldsd->completion_queue; 8188 oldsd->completion_queue = NULL; 8189 8190 /* Append output queue from offline CPU. */ 8191 if (oldsd->output_queue) { 8192 *sd->output_queue_tailp = oldsd->output_queue; 8193 sd->output_queue_tailp = oldsd->output_queue_tailp; 8194 oldsd->output_queue = NULL; 8195 oldsd->output_queue_tailp = &oldsd->output_queue; 8196 } 8197 /* Append NAPI poll list from offline CPU, with one exception : 8198 * process_backlog() must be called by cpu owning percpu backlog. 8199 * We properly handle process_queue & input_pkt_queue later. 8200 */ 8201 while (!list_empty(&oldsd->poll_list)) { 8202 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 8203 struct napi_struct, 8204 poll_list); 8205 8206 list_del_init(&napi->poll_list); 8207 if (napi->poll == process_backlog) 8208 napi->state = 0; 8209 else 8210 ____napi_schedule(sd, napi); 8211 } 8212 8213 raise_softirq_irqoff(NET_TX_SOFTIRQ); 8214 local_irq_enable(); 8215 8216 /* Process offline CPU's input_pkt_queue */ 8217 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 8218 netif_rx_ni(skb); 8219 input_queue_head_incr(oldsd); 8220 } 8221 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 8222 netif_rx_ni(skb); 8223 input_queue_head_incr(oldsd); 8224 } 8225 8226 return 0; 8227 } 8228 8229 /** 8230 * netdev_increment_features - increment feature set by one 8231 * @all: current feature set 8232 * @one: new feature set 8233 * @mask: mask feature set 8234 * 8235 * Computes a new feature set after adding a device with feature set 8236 * @one to the master device with current feature set @all. Will not 8237 * enable anything that is off in @mask. Returns the new feature set. 8238 */ 8239 netdev_features_t netdev_increment_features(netdev_features_t all, 8240 netdev_features_t one, netdev_features_t mask) 8241 { 8242 if (mask & NETIF_F_HW_CSUM) 8243 mask |= NETIF_F_CSUM_MASK; 8244 mask |= NETIF_F_VLAN_CHALLENGED; 8245 8246 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; 8247 all &= one | ~NETIF_F_ALL_FOR_ALL; 8248 8249 /* If one device supports hw checksumming, set for all. */ 8250 if (all & NETIF_F_HW_CSUM) 8251 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); 8252 8253 return all; 8254 } 8255 EXPORT_SYMBOL(netdev_increment_features); 8256 8257 static struct hlist_head * __net_init netdev_create_hash(void) 8258 { 8259 int i; 8260 struct hlist_head *hash; 8261 8262 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 8263 if (hash != NULL) 8264 for (i = 0; i < NETDEV_HASHENTRIES; i++) 8265 INIT_HLIST_HEAD(&hash[i]); 8266 8267 return hash; 8268 } 8269 8270 /* Initialize per network namespace state */ 8271 static int __net_init netdev_init(struct net *net) 8272 { 8273 if (net != &init_net) 8274 INIT_LIST_HEAD(&net->dev_base_head); 8275 8276 net->dev_name_head = netdev_create_hash(); 8277 if (net->dev_name_head == NULL) 8278 goto err_name; 8279 8280 net->dev_index_head = netdev_create_hash(); 8281 if (net->dev_index_head == NULL) 8282 goto err_idx; 8283 8284 return 0; 8285 8286 err_idx: 8287 kfree(net->dev_name_head); 8288 err_name: 8289 return -ENOMEM; 8290 } 8291 8292 /** 8293 * netdev_drivername - network driver for the device 8294 * @dev: network device 8295 * 8296 * Determine network driver for device. 8297 */ 8298 const char *netdev_drivername(const struct net_device *dev) 8299 { 8300 const struct device_driver *driver; 8301 const struct device *parent; 8302 const char *empty = ""; 8303 8304 parent = dev->dev.parent; 8305 if (!parent) 8306 return empty; 8307 8308 driver = parent->driver; 8309 if (driver && driver->name) 8310 return driver->name; 8311 return empty; 8312 } 8313 8314 static void __netdev_printk(const char *level, const struct net_device *dev, 8315 struct va_format *vaf) 8316 { 8317 if (dev && dev->dev.parent) { 8318 dev_printk_emit(level[1] - '0', 8319 dev->dev.parent, 8320 "%s %s %s%s: %pV", 8321 dev_driver_string(dev->dev.parent), 8322 dev_name(dev->dev.parent), 8323 netdev_name(dev), netdev_reg_state(dev), 8324 vaf); 8325 } else if (dev) { 8326 printk("%s%s%s: %pV", 8327 level, netdev_name(dev), netdev_reg_state(dev), vaf); 8328 } else { 8329 printk("%s(NULL net_device): %pV", level, vaf); 8330 } 8331 } 8332 8333 void netdev_printk(const char *level, const struct net_device *dev, 8334 const char *format, ...) 8335 { 8336 struct va_format vaf; 8337 va_list args; 8338 8339 va_start(args, format); 8340 8341 vaf.fmt = format; 8342 vaf.va = &args; 8343 8344 __netdev_printk(level, dev, &vaf); 8345 8346 va_end(args); 8347 } 8348 EXPORT_SYMBOL(netdev_printk); 8349 8350 #define define_netdev_printk_level(func, level) \ 8351 void func(const struct net_device *dev, const char *fmt, ...) \ 8352 { \ 8353 struct va_format vaf; \ 8354 va_list args; \ 8355 \ 8356 va_start(args, fmt); \ 8357 \ 8358 vaf.fmt = fmt; \ 8359 vaf.va = &args; \ 8360 \ 8361 __netdev_printk(level, dev, &vaf); \ 8362 \ 8363 va_end(args); \ 8364 } \ 8365 EXPORT_SYMBOL(func); 8366 8367 define_netdev_printk_level(netdev_emerg, KERN_EMERG); 8368 define_netdev_printk_level(netdev_alert, KERN_ALERT); 8369 define_netdev_printk_level(netdev_crit, KERN_CRIT); 8370 define_netdev_printk_level(netdev_err, KERN_ERR); 8371 define_netdev_printk_level(netdev_warn, KERN_WARNING); 8372 define_netdev_printk_level(netdev_notice, KERN_NOTICE); 8373 define_netdev_printk_level(netdev_info, KERN_INFO); 8374 8375 static void __net_exit netdev_exit(struct net *net) 8376 { 8377 kfree(net->dev_name_head); 8378 kfree(net->dev_index_head); 8379 } 8380 8381 static struct pernet_operations __net_initdata netdev_net_ops = { 8382 .init = netdev_init, 8383 .exit = netdev_exit, 8384 }; 8385 8386 static void __net_exit default_device_exit(struct net *net) 8387 { 8388 struct net_device *dev, *aux; 8389 /* 8390 * Push all migratable network devices back to the 8391 * initial network namespace 8392 */ 8393 rtnl_lock(); 8394 for_each_netdev_safe(net, dev, aux) { 8395 int err; 8396 char fb_name[IFNAMSIZ]; 8397 8398 /* Ignore unmoveable devices (i.e. loopback) */ 8399 if (dev->features & NETIF_F_NETNS_LOCAL) 8400 continue; 8401 8402 /* Leave virtual devices for the generic cleanup */ 8403 if (dev->rtnl_link_ops) 8404 continue; 8405 8406 /* Push remaining network devices to init_net */ 8407 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 8408 err = dev_change_net_namespace(dev, &init_net, fb_name); 8409 if (err) { 8410 pr_emerg("%s: failed to move %s to init_net: %d\n", 8411 __func__, dev->name, err); 8412 BUG(); 8413 } 8414 } 8415 rtnl_unlock(); 8416 } 8417 8418 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) 8419 { 8420 /* Return with the rtnl_lock held when there are no network 8421 * devices unregistering in any network namespace in net_list. 8422 */ 8423 struct net *net; 8424 bool unregistering; 8425 DEFINE_WAIT_FUNC(wait, woken_wake_function); 8426 8427 add_wait_queue(&netdev_unregistering_wq, &wait); 8428 for (;;) { 8429 unregistering = false; 8430 rtnl_lock(); 8431 list_for_each_entry(net, net_list, exit_list) { 8432 if (net->dev_unreg_count > 0) { 8433 unregistering = true; 8434 break; 8435 } 8436 } 8437 if (!unregistering) 8438 break; 8439 __rtnl_unlock(); 8440 8441 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 8442 } 8443 remove_wait_queue(&netdev_unregistering_wq, &wait); 8444 } 8445 8446 static void __net_exit default_device_exit_batch(struct list_head *net_list) 8447 { 8448 /* At exit all network devices most be removed from a network 8449 * namespace. Do this in the reverse order of registration. 8450 * Do this across as many network namespaces as possible to 8451 * improve batching efficiency. 8452 */ 8453 struct net_device *dev; 8454 struct net *net; 8455 LIST_HEAD(dev_kill_list); 8456 8457 /* To prevent network device cleanup code from dereferencing 8458 * loopback devices or network devices that have been freed 8459 * wait here for all pending unregistrations to complete, 8460 * before unregistring the loopback device and allowing the 8461 * network namespace be freed. 8462 * 8463 * The netdev todo list containing all network devices 8464 * unregistrations that happen in default_device_exit_batch 8465 * will run in the rtnl_unlock() at the end of 8466 * default_device_exit_batch. 8467 */ 8468 rtnl_lock_unregistering(net_list); 8469 list_for_each_entry(net, net_list, exit_list) { 8470 for_each_netdev_reverse(net, dev) { 8471 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) 8472 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); 8473 else 8474 unregister_netdevice_queue(dev, &dev_kill_list); 8475 } 8476 } 8477 unregister_netdevice_many(&dev_kill_list); 8478 rtnl_unlock(); 8479 } 8480 8481 static struct pernet_operations __net_initdata default_device_ops = { 8482 .exit = default_device_exit, 8483 .exit_batch = default_device_exit_batch, 8484 }; 8485 8486 /* 8487 * Initialize the DEV module. At boot time this walks the device list and 8488 * unhooks any devices that fail to initialise (normally hardware not 8489 * present) and leaves us with a valid list of present and active devices. 8490 * 8491 */ 8492 8493 /* 8494 * This is called single threaded during boot, so no need 8495 * to take the rtnl semaphore. 8496 */ 8497 static int __init net_dev_init(void) 8498 { 8499 int i, rc = -ENOMEM; 8500 8501 BUG_ON(!dev_boot_phase); 8502 8503 if (dev_proc_init()) 8504 goto out; 8505 8506 if (netdev_kobject_init()) 8507 goto out; 8508 8509 INIT_LIST_HEAD(&ptype_all); 8510 for (i = 0; i < PTYPE_HASH_SIZE; i++) 8511 INIT_LIST_HEAD(&ptype_base[i]); 8512 8513 INIT_LIST_HEAD(&offload_base); 8514 8515 if (register_pernet_subsys(&netdev_net_ops)) 8516 goto out; 8517 8518 /* 8519 * Initialise the packet receive queues. 8520 */ 8521 8522 for_each_possible_cpu(i) { 8523 struct work_struct *flush = per_cpu_ptr(&flush_works, i); 8524 struct softnet_data *sd = &per_cpu(softnet_data, i); 8525 8526 INIT_WORK(flush, flush_backlog); 8527 8528 skb_queue_head_init(&sd->input_pkt_queue); 8529 skb_queue_head_init(&sd->process_queue); 8530 INIT_LIST_HEAD(&sd->poll_list); 8531 sd->output_queue_tailp = &sd->output_queue; 8532 #ifdef CONFIG_RPS 8533 sd->csd.func = rps_trigger_softirq; 8534 sd->csd.info = sd; 8535 sd->cpu = i; 8536 #endif 8537 8538 sd->backlog.poll = process_backlog; 8539 sd->backlog.weight = weight_p; 8540 } 8541 8542 dev_boot_phase = 0; 8543 8544 /* The loopback device is special if any other network devices 8545 * is present in a network namespace the loopback device must 8546 * be present. Since we now dynamically allocate and free the 8547 * loopback device ensure this invariant is maintained by 8548 * keeping the loopback device as the first device on the 8549 * list of network devices. Ensuring the loopback devices 8550 * is the first device that appears and the last network device 8551 * that disappears. 8552 */ 8553 if (register_pernet_device(&loopback_net_ops)) 8554 goto out; 8555 8556 if (register_pernet_device(&default_device_ops)) 8557 goto out; 8558 8559 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 8560 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 8561 8562 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", 8563 NULL, dev_cpu_dead); 8564 WARN_ON(rc < 0); 8565 dst_subsys_init(); 8566 rc = 0; 8567 out: 8568 return rc; 8569 } 8570 8571 subsys_initcall(net_dev_init); 8572