1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <asm/system.h> 77 #include <linux/bitops.h> 78 #include <linux/capability.h> 79 #include <linux/cpu.h> 80 #include <linux/types.h> 81 #include <linux/kernel.h> 82 #include <linux/sched.h> 83 #include <linux/mutex.h> 84 #include <linux/string.h> 85 #include <linux/mm.h> 86 #include <linux/socket.h> 87 #include <linux/sockios.h> 88 #include <linux/errno.h> 89 #include <linux/interrupt.h> 90 #include <linux/if_ether.h> 91 #include <linux/netdevice.h> 92 #include <linux/etherdevice.h> 93 #include <linux/ethtool.h> 94 #include <linux/notifier.h> 95 #include <linux/skbuff.h> 96 #include <net/net_namespace.h> 97 #include <net/sock.h> 98 #include <linux/rtnetlink.h> 99 #include <linux/proc_fs.h> 100 #include <linux/seq_file.h> 101 #include <linux/stat.h> 102 #include <linux/if_bridge.h> 103 #include <linux/if_macvlan.h> 104 #include <net/dst.h> 105 #include <net/pkt_sched.h> 106 #include <net/checksum.h> 107 #include <linux/highmem.h> 108 #include <linux/init.h> 109 #include <linux/kmod.h> 110 #include <linux/module.h> 111 #include <linux/netpoll.h> 112 #include <linux/rcupdate.h> 113 #include <linux/delay.h> 114 #include <net/wext.h> 115 #include <net/iw_handler.h> 116 #include <asm/current.h> 117 #include <linux/audit.h> 118 #include <linux/dmaengine.h> 119 #include <linux/err.h> 120 #include <linux/ctype.h> 121 #include <linux/if_arp.h> 122 #include <linux/if_vlan.h> 123 #include <linux/ip.h> 124 #include <net/ip.h> 125 #include <linux/ipv6.h> 126 #include <linux/in.h> 127 #include <linux/jhash.h> 128 #include <linux/random.h> 129 130 #include "net-sysfs.h" 131 132 /* Instead of increasing this, you should create a hash table. */ 133 #define MAX_GRO_SKBS 8 134 135 /* This should be increased if a protocol with a bigger head is added. */ 136 #define GRO_MAX_HEAD (MAX_HEADER + 128) 137 138 /* 139 * The list of packet types we will receive (as opposed to discard) 140 * and the routines to invoke. 141 * 142 * Why 16. Because with 16 the only overlap we get on a hash of the 143 * low nibble of the protocol value is RARP/SNAP/X.25. 144 * 145 * NOTE: That is no longer true with the addition of VLAN tags. Not 146 * sure which should go first, but I bet it won't make much 147 * difference if we are running VLANs. The good news is that 148 * this protocol won't be in the list unless compiled in, so 149 * the average user (w/out VLANs) will not be adversely affected. 150 * --BLG 151 * 152 * 0800 IP 153 * 8100 802.1Q VLAN 154 * 0001 802.3 155 * 0002 AX.25 156 * 0004 802.2 157 * 8035 RARP 158 * 0005 SNAP 159 * 0805 X.25 160 * 0806 ARP 161 * 8137 IPX 162 * 0009 Localtalk 163 * 86DD IPv6 164 */ 165 166 #define PTYPE_HASH_SIZE (16) 167 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) 168 169 static DEFINE_SPINLOCK(ptype_lock); 170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 171 static struct list_head ptype_all __read_mostly; /* Taps */ 172 173 /* 174 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 175 * semaphore. 176 * 177 * Pure readers hold dev_base_lock for reading. 178 * 179 * Writers must hold the rtnl semaphore while they loop through the 180 * dev_base_head list, and hold dev_base_lock for writing when they do the 181 * actual updates. This allows pure readers to access the list even 182 * while a writer is preparing to update it. 183 * 184 * To put it another way, dev_base_lock is held for writing only to 185 * protect against pure readers; the rtnl semaphore provides the 186 * protection against other writers. 187 * 188 * See, for example usages, register_netdevice() and 189 * unregister_netdevice(), which must be called with the rtnl 190 * semaphore held. 191 */ 192 DEFINE_RWLOCK(dev_base_lock); 193 194 EXPORT_SYMBOL(dev_base_lock); 195 196 #define NETDEV_HASHBITS 8 197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) 198 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 200 { 201 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); 202 return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; 203 } 204 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 206 { 207 return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; 208 } 209 210 /* Device list insertion */ 211 static int list_netdevice(struct net_device *dev) 212 { 213 struct net *net = dev_net(dev); 214 215 ASSERT_RTNL(); 216 217 write_lock_bh(&dev_base_lock); 218 list_add_tail(&dev->dev_list, &net->dev_base_head); 219 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); 220 hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); 221 write_unlock_bh(&dev_base_lock); 222 return 0; 223 } 224 225 /* Device list removal */ 226 static void unlist_netdevice(struct net_device *dev) 227 { 228 ASSERT_RTNL(); 229 230 /* Unlink dev from the device chain */ 231 write_lock_bh(&dev_base_lock); 232 list_del(&dev->dev_list); 233 hlist_del(&dev->name_hlist); 234 hlist_del(&dev->index_hlist); 235 write_unlock_bh(&dev_base_lock); 236 } 237 238 /* 239 * Our notifier list 240 */ 241 242 static RAW_NOTIFIER_HEAD(netdev_chain); 243 244 /* 245 * Device drivers call our routines to queue packets here. We empty the 246 * queue in the local softnet handler. 247 */ 248 249 DEFINE_PER_CPU(struct softnet_data, softnet_data); 250 251 #ifdef CONFIG_LOCKDEP 252 /* 253 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 254 * according to dev->type 255 */ 256 static const unsigned short netdev_lock_type[] = 257 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 258 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 259 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 260 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 261 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 262 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 263 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 264 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 265 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 266 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 267 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 268 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 269 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, 270 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, 271 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; 272 273 static const char *netdev_lock_name[] = 274 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 275 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 276 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 277 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 278 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 279 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 280 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 281 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 282 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 283 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 284 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 285 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 286 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", 287 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", 288 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; 289 290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 292 293 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 294 { 295 int i; 296 297 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 298 if (netdev_lock_type[i] == dev_type) 299 return i; 300 /* the last key is used by default */ 301 return ARRAY_SIZE(netdev_lock_type) - 1; 302 } 303 304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 305 unsigned short dev_type) 306 { 307 int i; 308 309 i = netdev_lock_pos(dev_type); 310 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 311 netdev_lock_name[i]); 312 } 313 314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 315 { 316 int i; 317 318 i = netdev_lock_pos(dev->type); 319 lockdep_set_class_and_name(&dev->addr_list_lock, 320 &netdev_addr_lock_key[i], 321 netdev_lock_name[i]); 322 } 323 #else 324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 325 unsigned short dev_type) 326 { 327 } 328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 329 { 330 } 331 #endif 332 333 /******************************************************************************* 334 335 Protocol management and registration routines 336 337 *******************************************************************************/ 338 339 /* 340 * Add a protocol ID to the list. Now that the input handler is 341 * smarter we can dispense with all the messy stuff that used to be 342 * here. 343 * 344 * BEWARE!!! Protocol handlers, mangling input packets, 345 * MUST BE last in hash buckets and checking protocol handlers 346 * MUST start from promiscuous ptype_all chain in net_bh. 347 * It is true now, do not change it. 348 * Explanation follows: if protocol handler, mangling packet, will 349 * be the first on list, it is not able to sense, that packet 350 * is cloned and should be copied-on-write, so that it will 351 * change it and subsequent readers will get broken packet. 352 * --ANK (980803) 353 */ 354 355 /** 356 * dev_add_pack - add packet handler 357 * @pt: packet type declaration 358 * 359 * Add a protocol handler to the networking stack. The passed &packet_type 360 * is linked into kernel lists and may not be freed until it has been 361 * removed from the kernel lists. 362 * 363 * This call does not sleep therefore it can not 364 * guarantee all CPU's that are in middle of receiving packets 365 * will see the new packet type (until the next received packet). 366 */ 367 368 void dev_add_pack(struct packet_type *pt) 369 { 370 int hash; 371 372 spin_lock_bh(&ptype_lock); 373 if (pt->type == htons(ETH_P_ALL)) 374 list_add_rcu(&pt->list, &ptype_all); 375 else { 376 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 377 list_add_rcu(&pt->list, &ptype_base[hash]); 378 } 379 spin_unlock_bh(&ptype_lock); 380 } 381 382 /** 383 * __dev_remove_pack - remove packet handler 384 * @pt: packet type declaration 385 * 386 * Remove a protocol handler that was previously added to the kernel 387 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 388 * from the kernel lists and can be freed or reused once this function 389 * returns. 390 * 391 * The packet type might still be in use by receivers 392 * and must not be freed until after all the CPU's have gone 393 * through a quiescent state. 394 */ 395 void __dev_remove_pack(struct packet_type *pt) 396 { 397 struct list_head *head; 398 struct packet_type *pt1; 399 400 spin_lock_bh(&ptype_lock); 401 402 if (pt->type == htons(ETH_P_ALL)) 403 head = &ptype_all; 404 else 405 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 406 407 list_for_each_entry(pt1, head, list) { 408 if (pt == pt1) { 409 list_del_rcu(&pt->list); 410 goto out; 411 } 412 } 413 414 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); 415 out: 416 spin_unlock_bh(&ptype_lock); 417 } 418 /** 419 * dev_remove_pack - remove packet handler 420 * @pt: packet type declaration 421 * 422 * Remove a protocol handler that was previously added to the kernel 423 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 424 * from the kernel lists and can be freed or reused once this function 425 * returns. 426 * 427 * This call sleeps to guarantee that no CPU is looking at the packet 428 * type after return. 429 */ 430 void dev_remove_pack(struct packet_type *pt) 431 { 432 __dev_remove_pack(pt); 433 434 synchronize_net(); 435 } 436 437 /****************************************************************************** 438 439 Device Boot-time Settings Routines 440 441 *******************************************************************************/ 442 443 /* Boot time configuration table */ 444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 445 446 /** 447 * netdev_boot_setup_add - add new setup entry 448 * @name: name of the device 449 * @map: configured settings for the device 450 * 451 * Adds new setup entry to the dev_boot_setup list. The function 452 * returns 0 on error and 1 on success. This is a generic routine to 453 * all netdevices. 454 */ 455 static int netdev_boot_setup_add(char *name, struct ifmap *map) 456 { 457 struct netdev_boot_setup *s; 458 int i; 459 460 s = dev_boot_setup; 461 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 462 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 463 memset(s[i].name, 0, sizeof(s[i].name)); 464 strlcpy(s[i].name, name, IFNAMSIZ); 465 memcpy(&s[i].map, map, sizeof(s[i].map)); 466 break; 467 } 468 } 469 470 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 471 } 472 473 /** 474 * netdev_boot_setup_check - check boot time settings 475 * @dev: the netdevice 476 * 477 * Check boot time settings for the device. 478 * The found settings are set for the device to be used 479 * later in the device probing. 480 * Returns 0 if no settings found, 1 if they are. 481 */ 482 int netdev_boot_setup_check(struct net_device *dev) 483 { 484 struct netdev_boot_setup *s = dev_boot_setup; 485 int i; 486 487 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 488 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 489 !strcmp(dev->name, s[i].name)) { 490 dev->irq = s[i].map.irq; 491 dev->base_addr = s[i].map.base_addr; 492 dev->mem_start = s[i].map.mem_start; 493 dev->mem_end = s[i].map.mem_end; 494 return 1; 495 } 496 } 497 return 0; 498 } 499 500 501 /** 502 * netdev_boot_base - get address from boot time settings 503 * @prefix: prefix for network device 504 * @unit: id for network device 505 * 506 * Check boot time settings for the base address of device. 507 * The found settings are set for the device to be used 508 * later in the device probing. 509 * Returns 0 if no settings found. 510 */ 511 unsigned long netdev_boot_base(const char *prefix, int unit) 512 { 513 const struct netdev_boot_setup *s = dev_boot_setup; 514 char name[IFNAMSIZ]; 515 int i; 516 517 sprintf(name, "%s%d", prefix, unit); 518 519 /* 520 * If device already registered then return base of 1 521 * to indicate not to probe for this interface 522 */ 523 if (__dev_get_by_name(&init_net, name)) 524 return 1; 525 526 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 527 if (!strcmp(name, s[i].name)) 528 return s[i].map.base_addr; 529 return 0; 530 } 531 532 /* 533 * Saves at boot time configured settings for any netdevice. 534 */ 535 int __init netdev_boot_setup(char *str) 536 { 537 int ints[5]; 538 struct ifmap map; 539 540 str = get_options(str, ARRAY_SIZE(ints), ints); 541 if (!str || !*str) 542 return 0; 543 544 /* Save settings */ 545 memset(&map, 0, sizeof(map)); 546 if (ints[0] > 0) 547 map.irq = ints[1]; 548 if (ints[0] > 1) 549 map.base_addr = ints[2]; 550 if (ints[0] > 2) 551 map.mem_start = ints[3]; 552 if (ints[0] > 3) 553 map.mem_end = ints[4]; 554 555 /* Add new entry to the list */ 556 return netdev_boot_setup_add(str, &map); 557 } 558 559 __setup("netdev=", netdev_boot_setup); 560 561 /******************************************************************************* 562 563 Device Interface Subroutines 564 565 *******************************************************************************/ 566 567 /** 568 * __dev_get_by_name - find a device by its name 569 * @net: the applicable net namespace 570 * @name: name to find 571 * 572 * Find an interface by name. Must be called under RTNL semaphore 573 * or @dev_base_lock. If the name is found a pointer to the device 574 * is returned. If the name is not found then %NULL is returned. The 575 * reference counters are not incremented so the caller must be 576 * careful with locks. 577 */ 578 579 struct net_device *__dev_get_by_name(struct net *net, const char *name) 580 { 581 struct hlist_node *p; 582 583 hlist_for_each(p, dev_name_hash(net, name)) { 584 struct net_device *dev 585 = hlist_entry(p, struct net_device, name_hlist); 586 if (!strncmp(dev->name, name, IFNAMSIZ)) 587 return dev; 588 } 589 return NULL; 590 } 591 592 /** 593 * dev_get_by_name - find a device by its name 594 * @net: the applicable net namespace 595 * @name: name to find 596 * 597 * Find an interface by name. This can be called from any 598 * context and does its own locking. The returned handle has 599 * the usage count incremented and the caller must use dev_put() to 600 * release it when it is no longer needed. %NULL is returned if no 601 * matching device is found. 602 */ 603 604 struct net_device *dev_get_by_name(struct net *net, const char *name) 605 { 606 struct net_device *dev; 607 608 read_lock(&dev_base_lock); 609 dev = __dev_get_by_name(net, name); 610 if (dev) 611 dev_hold(dev); 612 read_unlock(&dev_base_lock); 613 return dev; 614 } 615 616 /** 617 * __dev_get_by_index - find a device by its ifindex 618 * @net: the applicable net namespace 619 * @ifindex: index of device 620 * 621 * Search for an interface by index. Returns %NULL if the device 622 * is not found or a pointer to the device. The device has not 623 * had its reference counter increased so the caller must be careful 624 * about locking. The caller must hold either the RTNL semaphore 625 * or @dev_base_lock. 626 */ 627 628 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 629 { 630 struct hlist_node *p; 631 632 hlist_for_each(p, dev_index_hash(net, ifindex)) { 633 struct net_device *dev 634 = hlist_entry(p, struct net_device, index_hlist); 635 if (dev->ifindex == ifindex) 636 return dev; 637 } 638 return NULL; 639 } 640 641 642 /** 643 * dev_get_by_index - find a device by its ifindex 644 * @net: the applicable net namespace 645 * @ifindex: index of device 646 * 647 * Search for an interface by index. Returns NULL if the device 648 * is not found or a pointer to the device. The device returned has 649 * had a reference added and the pointer is safe until the user calls 650 * dev_put to indicate they have finished with it. 651 */ 652 653 struct net_device *dev_get_by_index(struct net *net, int ifindex) 654 { 655 struct net_device *dev; 656 657 read_lock(&dev_base_lock); 658 dev = __dev_get_by_index(net, ifindex); 659 if (dev) 660 dev_hold(dev); 661 read_unlock(&dev_base_lock); 662 return dev; 663 } 664 665 /** 666 * dev_getbyhwaddr - find a device by its hardware address 667 * @net: the applicable net namespace 668 * @type: media type of device 669 * @ha: hardware address 670 * 671 * Search for an interface by MAC address. Returns NULL if the device 672 * is not found or a pointer to the device. The caller must hold the 673 * rtnl semaphore. The returned device has not had its ref count increased 674 * and the caller must therefore be careful about locking 675 * 676 * BUGS: 677 * If the API was consistent this would be __dev_get_by_hwaddr 678 */ 679 680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) 681 { 682 struct net_device *dev; 683 684 ASSERT_RTNL(); 685 686 for_each_netdev(net, dev) 687 if (dev->type == type && 688 !memcmp(dev->dev_addr, ha, dev->addr_len)) 689 return dev; 690 691 return NULL; 692 } 693 694 EXPORT_SYMBOL(dev_getbyhwaddr); 695 696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 697 { 698 struct net_device *dev; 699 700 ASSERT_RTNL(); 701 for_each_netdev(net, dev) 702 if (dev->type == type) 703 return dev; 704 705 return NULL; 706 } 707 708 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 709 710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 711 { 712 struct net_device *dev; 713 714 rtnl_lock(); 715 dev = __dev_getfirstbyhwtype(net, type); 716 if (dev) 717 dev_hold(dev); 718 rtnl_unlock(); 719 return dev; 720 } 721 722 EXPORT_SYMBOL(dev_getfirstbyhwtype); 723 724 /** 725 * dev_get_by_flags - find any device with given flags 726 * @net: the applicable net namespace 727 * @if_flags: IFF_* values 728 * @mask: bitmask of bits in if_flags to check 729 * 730 * Search for any interface with the given flags. Returns NULL if a device 731 * is not found or a pointer to the device. The device returned has 732 * had a reference added and the pointer is safe until the user calls 733 * dev_put to indicate they have finished with it. 734 */ 735 736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) 737 { 738 struct net_device *dev, *ret; 739 740 ret = NULL; 741 read_lock(&dev_base_lock); 742 for_each_netdev(net, dev) { 743 if (((dev->flags ^ if_flags) & mask) == 0) { 744 dev_hold(dev); 745 ret = dev; 746 break; 747 } 748 } 749 read_unlock(&dev_base_lock); 750 return ret; 751 } 752 753 /** 754 * dev_valid_name - check if name is okay for network device 755 * @name: name string 756 * 757 * Network device names need to be valid file names to 758 * to allow sysfs to work. We also disallow any kind of 759 * whitespace. 760 */ 761 int dev_valid_name(const char *name) 762 { 763 if (*name == '\0') 764 return 0; 765 if (strlen(name) >= IFNAMSIZ) 766 return 0; 767 if (!strcmp(name, ".") || !strcmp(name, "..")) 768 return 0; 769 770 while (*name) { 771 if (*name == '/' || isspace(*name)) 772 return 0; 773 name++; 774 } 775 return 1; 776 } 777 778 /** 779 * __dev_alloc_name - allocate a name for a device 780 * @net: network namespace to allocate the device name in 781 * @name: name format string 782 * @buf: scratch buffer and result name string 783 * 784 * Passed a format string - eg "lt%d" it will try and find a suitable 785 * id. It scans list of devices to build up a free map, then chooses 786 * the first empty slot. The caller must hold the dev_base or rtnl lock 787 * while allocating the name and adding the device in order to avoid 788 * duplicates. 789 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 790 * Returns the number of the unit assigned or a negative errno code. 791 */ 792 793 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 794 { 795 int i = 0; 796 const char *p; 797 const int max_netdevices = 8*PAGE_SIZE; 798 unsigned long *inuse; 799 struct net_device *d; 800 801 p = strnchr(name, IFNAMSIZ-1, '%'); 802 if (p) { 803 /* 804 * Verify the string as this thing may have come from 805 * the user. There must be either one "%d" and no other "%" 806 * characters. 807 */ 808 if (p[1] != 'd' || strchr(p + 2, '%')) 809 return -EINVAL; 810 811 /* Use one page as a bit array of possible slots */ 812 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 813 if (!inuse) 814 return -ENOMEM; 815 816 for_each_netdev(net, d) { 817 if (!sscanf(d->name, name, &i)) 818 continue; 819 if (i < 0 || i >= max_netdevices) 820 continue; 821 822 /* avoid cases where sscanf is not exact inverse of printf */ 823 snprintf(buf, IFNAMSIZ, name, i); 824 if (!strncmp(buf, d->name, IFNAMSIZ)) 825 set_bit(i, inuse); 826 } 827 828 i = find_first_zero_bit(inuse, max_netdevices); 829 free_page((unsigned long) inuse); 830 } 831 832 snprintf(buf, IFNAMSIZ, name, i); 833 if (!__dev_get_by_name(net, buf)) 834 return i; 835 836 /* It is possible to run out of possible slots 837 * when the name is long and there isn't enough space left 838 * for the digits, or if all bits are used. 839 */ 840 return -ENFILE; 841 } 842 843 /** 844 * dev_alloc_name - allocate a name for a device 845 * @dev: device 846 * @name: name format string 847 * 848 * Passed a format string - eg "lt%d" it will try and find a suitable 849 * id. It scans list of devices to build up a free map, then chooses 850 * the first empty slot. The caller must hold the dev_base or rtnl lock 851 * while allocating the name and adding the device in order to avoid 852 * duplicates. 853 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 854 * Returns the number of the unit assigned or a negative errno code. 855 */ 856 857 int dev_alloc_name(struct net_device *dev, const char *name) 858 { 859 char buf[IFNAMSIZ]; 860 struct net *net; 861 int ret; 862 863 BUG_ON(!dev_net(dev)); 864 net = dev_net(dev); 865 ret = __dev_alloc_name(net, name, buf); 866 if (ret >= 0) 867 strlcpy(dev->name, buf, IFNAMSIZ); 868 return ret; 869 } 870 871 872 /** 873 * dev_change_name - change name of a device 874 * @dev: device 875 * @newname: name (or format string) must be at least IFNAMSIZ 876 * 877 * Change name of a device, can pass format strings "eth%d". 878 * for wildcarding. 879 */ 880 int dev_change_name(struct net_device *dev, const char *newname) 881 { 882 char oldname[IFNAMSIZ]; 883 int err = 0; 884 int ret; 885 struct net *net; 886 887 ASSERT_RTNL(); 888 BUG_ON(!dev_net(dev)); 889 890 net = dev_net(dev); 891 if (dev->flags & IFF_UP) 892 return -EBUSY; 893 894 if (!dev_valid_name(newname)) 895 return -EINVAL; 896 897 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) 898 return 0; 899 900 memcpy(oldname, dev->name, IFNAMSIZ); 901 902 if (strchr(newname, '%')) { 903 err = dev_alloc_name(dev, newname); 904 if (err < 0) 905 return err; 906 } 907 else if (__dev_get_by_name(net, newname)) 908 return -EEXIST; 909 else 910 strlcpy(dev->name, newname, IFNAMSIZ); 911 912 rollback: 913 /* For now only devices in the initial network namespace 914 * are in sysfs. 915 */ 916 if (net == &init_net) { 917 ret = device_rename(&dev->dev, dev->name); 918 if (ret) { 919 memcpy(dev->name, oldname, IFNAMSIZ); 920 return ret; 921 } 922 } 923 924 write_lock_bh(&dev_base_lock); 925 hlist_del(&dev->name_hlist); 926 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); 927 write_unlock_bh(&dev_base_lock); 928 929 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 930 ret = notifier_to_errno(ret); 931 932 if (ret) { 933 if (err) { 934 printk(KERN_ERR 935 "%s: name change rollback failed: %d.\n", 936 dev->name, ret); 937 } else { 938 err = ret; 939 memcpy(dev->name, oldname, IFNAMSIZ); 940 goto rollback; 941 } 942 } 943 944 return err; 945 } 946 947 /** 948 * dev_set_alias - change ifalias of a device 949 * @dev: device 950 * @alias: name up to IFALIASZ 951 * @len: limit of bytes to copy from info 952 * 953 * Set ifalias for a device, 954 */ 955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len) 956 { 957 ASSERT_RTNL(); 958 959 if (len >= IFALIASZ) 960 return -EINVAL; 961 962 if (!len) { 963 if (dev->ifalias) { 964 kfree(dev->ifalias); 965 dev->ifalias = NULL; 966 } 967 return 0; 968 } 969 970 dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL); 971 if (!dev->ifalias) 972 return -ENOMEM; 973 974 strlcpy(dev->ifalias, alias, len+1); 975 return len; 976 } 977 978 979 /** 980 * netdev_features_change - device changes features 981 * @dev: device to cause notification 982 * 983 * Called to indicate a device has changed features. 984 */ 985 void netdev_features_change(struct net_device *dev) 986 { 987 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 988 } 989 EXPORT_SYMBOL(netdev_features_change); 990 991 /** 992 * netdev_state_change - device changes state 993 * @dev: device to cause notification 994 * 995 * Called to indicate a device has changed state. This function calls 996 * the notifier chains for netdev_chain and sends a NEWLINK message 997 * to the routing socket. 998 */ 999 void netdev_state_change(struct net_device *dev) 1000 { 1001 if (dev->flags & IFF_UP) { 1002 call_netdevice_notifiers(NETDEV_CHANGE, dev); 1003 rtmsg_ifinfo(RTM_NEWLINK, dev, 0); 1004 } 1005 } 1006 1007 void netdev_bonding_change(struct net_device *dev) 1008 { 1009 call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); 1010 } 1011 EXPORT_SYMBOL(netdev_bonding_change); 1012 1013 /** 1014 * dev_load - load a network module 1015 * @net: the applicable net namespace 1016 * @name: name of interface 1017 * 1018 * If a network interface is not present and the process has suitable 1019 * privileges this function loads the module. If module loading is not 1020 * available in this kernel then it becomes a nop. 1021 */ 1022 1023 void dev_load(struct net *net, const char *name) 1024 { 1025 struct net_device *dev; 1026 1027 read_lock(&dev_base_lock); 1028 dev = __dev_get_by_name(net, name); 1029 read_unlock(&dev_base_lock); 1030 1031 if (!dev && capable(CAP_SYS_MODULE)) 1032 request_module("%s", name); 1033 } 1034 1035 /** 1036 * dev_open - prepare an interface for use. 1037 * @dev: device to open 1038 * 1039 * Takes a device from down to up state. The device's private open 1040 * function is invoked and then the multicast lists are loaded. Finally 1041 * the device is moved into the up state and a %NETDEV_UP message is 1042 * sent to the netdev notifier chain. 1043 * 1044 * Calling this function on an active interface is a nop. On a failure 1045 * a negative errno code is returned. 1046 */ 1047 int dev_open(struct net_device *dev) 1048 { 1049 const struct net_device_ops *ops = dev->netdev_ops; 1050 int ret = 0; 1051 1052 ASSERT_RTNL(); 1053 1054 /* 1055 * Is it already up? 1056 */ 1057 1058 if (dev->flags & IFF_UP) 1059 return 0; 1060 1061 /* 1062 * Is it even present? 1063 */ 1064 if (!netif_device_present(dev)) 1065 return -ENODEV; 1066 1067 /* 1068 * Call device private open method 1069 */ 1070 set_bit(__LINK_STATE_START, &dev->state); 1071 1072 if (ops->ndo_validate_addr) 1073 ret = ops->ndo_validate_addr(dev); 1074 1075 if (!ret && ops->ndo_open) 1076 ret = ops->ndo_open(dev); 1077 1078 /* 1079 * If it went open OK then: 1080 */ 1081 1082 if (ret) 1083 clear_bit(__LINK_STATE_START, &dev->state); 1084 else { 1085 /* 1086 * Set the flags. 1087 */ 1088 dev->flags |= IFF_UP; 1089 1090 /* 1091 * Enable NET_DMA 1092 */ 1093 net_dmaengine_get(); 1094 1095 /* 1096 * Initialize multicasting status 1097 */ 1098 dev_set_rx_mode(dev); 1099 1100 /* 1101 * Wakeup transmit queue engine 1102 */ 1103 dev_activate(dev); 1104 1105 /* 1106 * ... and announce new interface. 1107 */ 1108 call_netdevice_notifiers(NETDEV_UP, dev); 1109 } 1110 1111 return ret; 1112 } 1113 1114 /** 1115 * dev_close - shutdown an interface. 1116 * @dev: device to shutdown 1117 * 1118 * This function moves an active device into down state. A 1119 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1120 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1121 * chain. 1122 */ 1123 int dev_close(struct net_device *dev) 1124 { 1125 const struct net_device_ops *ops = dev->netdev_ops; 1126 ASSERT_RTNL(); 1127 1128 might_sleep(); 1129 1130 if (!(dev->flags & IFF_UP)) 1131 return 0; 1132 1133 /* 1134 * Tell people we are going down, so that they can 1135 * prepare to death, when device is still operating. 1136 */ 1137 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1138 1139 clear_bit(__LINK_STATE_START, &dev->state); 1140 1141 /* Synchronize to scheduled poll. We cannot touch poll list, 1142 * it can be even on different cpu. So just clear netif_running(). 1143 * 1144 * dev->stop() will invoke napi_disable() on all of it's 1145 * napi_struct instances on this device. 1146 */ 1147 smp_mb__after_clear_bit(); /* Commit netif_running(). */ 1148 1149 dev_deactivate(dev); 1150 1151 /* 1152 * Call the device specific close. This cannot fail. 1153 * Only if device is UP 1154 * 1155 * We allow it to be called even after a DETACH hot-plug 1156 * event. 1157 */ 1158 if (ops->ndo_stop) 1159 ops->ndo_stop(dev); 1160 1161 /* 1162 * Device is now down. 1163 */ 1164 1165 dev->flags &= ~IFF_UP; 1166 1167 /* 1168 * Tell people we are down 1169 */ 1170 call_netdevice_notifiers(NETDEV_DOWN, dev); 1171 1172 /* 1173 * Shutdown NET_DMA 1174 */ 1175 net_dmaengine_put(); 1176 1177 return 0; 1178 } 1179 1180 1181 /** 1182 * dev_disable_lro - disable Large Receive Offload on a device 1183 * @dev: device 1184 * 1185 * Disable Large Receive Offload (LRO) on a net device. Must be 1186 * called under RTNL. This is needed if received packets may be 1187 * forwarded to another interface. 1188 */ 1189 void dev_disable_lro(struct net_device *dev) 1190 { 1191 if (dev->ethtool_ops && dev->ethtool_ops->get_flags && 1192 dev->ethtool_ops->set_flags) { 1193 u32 flags = dev->ethtool_ops->get_flags(dev); 1194 if (flags & ETH_FLAG_LRO) { 1195 flags &= ~ETH_FLAG_LRO; 1196 dev->ethtool_ops->set_flags(dev, flags); 1197 } 1198 } 1199 WARN_ON(dev->features & NETIF_F_LRO); 1200 } 1201 EXPORT_SYMBOL(dev_disable_lro); 1202 1203 1204 static int dev_boot_phase = 1; 1205 1206 /* 1207 * Device change register/unregister. These are not inline or static 1208 * as we export them to the world. 1209 */ 1210 1211 /** 1212 * register_netdevice_notifier - register a network notifier block 1213 * @nb: notifier 1214 * 1215 * Register a notifier to be called when network device events occur. 1216 * The notifier passed is linked into the kernel structures and must 1217 * not be reused until it has been unregistered. A negative errno code 1218 * is returned on a failure. 1219 * 1220 * When registered all registration and up events are replayed 1221 * to the new notifier to allow device to have a race free 1222 * view of the network device list. 1223 */ 1224 1225 int register_netdevice_notifier(struct notifier_block *nb) 1226 { 1227 struct net_device *dev; 1228 struct net_device *last; 1229 struct net *net; 1230 int err; 1231 1232 rtnl_lock(); 1233 err = raw_notifier_chain_register(&netdev_chain, nb); 1234 if (err) 1235 goto unlock; 1236 if (dev_boot_phase) 1237 goto unlock; 1238 for_each_net(net) { 1239 for_each_netdev(net, dev) { 1240 err = nb->notifier_call(nb, NETDEV_REGISTER, dev); 1241 err = notifier_to_errno(err); 1242 if (err) 1243 goto rollback; 1244 1245 if (!(dev->flags & IFF_UP)) 1246 continue; 1247 1248 nb->notifier_call(nb, NETDEV_UP, dev); 1249 } 1250 } 1251 1252 unlock: 1253 rtnl_unlock(); 1254 return err; 1255 1256 rollback: 1257 last = dev; 1258 for_each_net(net) { 1259 for_each_netdev(net, dev) { 1260 if (dev == last) 1261 break; 1262 1263 if (dev->flags & IFF_UP) { 1264 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); 1265 nb->notifier_call(nb, NETDEV_DOWN, dev); 1266 } 1267 nb->notifier_call(nb, NETDEV_UNREGISTER, dev); 1268 } 1269 } 1270 1271 raw_notifier_chain_unregister(&netdev_chain, nb); 1272 goto unlock; 1273 } 1274 1275 /** 1276 * unregister_netdevice_notifier - unregister a network notifier block 1277 * @nb: notifier 1278 * 1279 * Unregister a notifier previously registered by 1280 * register_netdevice_notifier(). The notifier is unlinked into the 1281 * kernel structures and may then be reused. A negative errno code 1282 * is returned on a failure. 1283 */ 1284 1285 int unregister_netdevice_notifier(struct notifier_block *nb) 1286 { 1287 int err; 1288 1289 rtnl_lock(); 1290 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1291 rtnl_unlock(); 1292 return err; 1293 } 1294 1295 /** 1296 * call_netdevice_notifiers - call all network notifier blocks 1297 * @val: value passed unmodified to notifier function 1298 * @dev: net_device pointer passed unmodified to notifier function 1299 * 1300 * Call all network notifier blocks. Parameters and return value 1301 * are as for raw_notifier_call_chain(). 1302 */ 1303 1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1305 { 1306 return raw_notifier_call_chain(&netdev_chain, val, dev); 1307 } 1308 1309 /* When > 0 there are consumers of rx skb time stamps */ 1310 static atomic_t netstamp_needed = ATOMIC_INIT(0); 1311 1312 void net_enable_timestamp(void) 1313 { 1314 atomic_inc(&netstamp_needed); 1315 } 1316 1317 void net_disable_timestamp(void) 1318 { 1319 atomic_dec(&netstamp_needed); 1320 } 1321 1322 static inline void net_timestamp(struct sk_buff *skb) 1323 { 1324 if (atomic_read(&netstamp_needed)) 1325 __net_timestamp(skb); 1326 else 1327 skb->tstamp.tv64 = 0; 1328 } 1329 1330 /* 1331 * Support routine. Sends outgoing frames to any network 1332 * taps currently in use. 1333 */ 1334 1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1336 { 1337 struct packet_type *ptype; 1338 1339 net_timestamp(skb); 1340 1341 rcu_read_lock(); 1342 list_for_each_entry_rcu(ptype, &ptype_all, list) { 1343 /* Never send packets back to the socket 1344 * they originated from - MvS (miquels@drinkel.ow.org) 1345 */ 1346 if ((ptype->dev == dev || !ptype->dev) && 1347 (ptype->af_packet_priv == NULL || 1348 (struct sock *)ptype->af_packet_priv != skb->sk)) { 1349 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); 1350 if (!skb2) 1351 break; 1352 1353 /* skb->nh should be correctly 1354 set by sender, so that the second statement is 1355 just protection against buggy protocols. 1356 */ 1357 skb_reset_mac_header(skb2); 1358 1359 if (skb_network_header(skb2) < skb2->data || 1360 skb2->network_header > skb2->tail) { 1361 if (net_ratelimit()) 1362 printk(KERN_CRIT "protocol %04x is " 1363 "buggy, dev %s\n", 1364 skb2->protocol, dev->name); 1365 skb_reset_network_header(skb2); 1366 } 1367 1368 skb2->transport_header = skb2->network_header; 1369 skb2->pkt_type = PACKET_OUTGOING; 1370 ptype->func(skb2, skb->dev, ptype, skb->dev); 1371 } 1372 } 1373 rcu_read_unlock(); 1374 } 1375 1376 1377 static inline void __netif_reschedule(struct Qdisc *q) 1378 { 1379 struct softnet_data *sd; 1380 unsigned long flags; 1381 1382 local_irq_save(flags); 1383 sd = &__get_cpu_var(softnet_data); 1384 q->next_sched = sd->output_queue; 1385 sd->output_queue = q; 1386 raise_softirq_irqoff(NET_TX_SOFTIRQ); 1387 local_irq_restore(flags); 1388 } 1389 1390 void __netif_schedule(struct Qdisc *q) 1391 { 1392 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 1393 __netif_reschedule(q); 1394 } 1395 EXPORT_SYMBOL(__netif_schedule); 1396 1397 void dev_kfree_skb_irq(struct sk_buff *skb) 1398 { 1399 if (atomic_dec_and_test(&skb->users)) { 1400 struct softnet_data *sd; 1401 unsigned long flags; 1402 1403 local_irq_save(flags); 1404 sd = &__get_cpu_var(softnet_data); 1405 skb->next = sd->completion_queue; 1406 sd->completion_queue = skb; 1407 raise_softirq_irqoff(NET_TX_SOFTIRQ); 1408 local_irq_restore(flags); 1409 } 1410 } 1411 EXPORT_SYMBOL(dev_kfree_skb_irq); 1412 1413 void dev_kfree_skb_any(struct sk_buff *skb) 1414 { 1415 if (in_irq() || irqs_disabled()) 1416 dev_kfree_skb_irq(skb); 1417 else 1418 dev_kfree_skb(skb); 1419 } 1420 EXPORT_SYMBOL(dev_kfree_skb_any); 1421 1422 1423 /** 1424 * netif_device_detach - mark device as removed 1425 * @dev: network device 1426 * 1427 * Mark device as removed from system and therefore no longer available. 1428 */ 1429 void netif_device_detach(struct net_device *dev) 1430 { 1431 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 1432 netif_running(dev)) { 1433 netif_stop_queue(dev); 1434 } 1435 } 1436 EXPORT_SYMBOL(netif_device_detach); 1437 1438 /** 1439 * netif_device_attach - mark device as attached 1440 * @dev: network device 1441 * 1442 * Mark device as attached from system and restart if needed. 1443 */ 1444 void netif_device_attach(struct net_device *dev) 1445 { 1446 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 1447 netif_running(dev)) { 1448 netif_wake_queue(dev); 1449 __netdev_watchdog_up(dev); 1450 } 1451 } 1452 EXPORT_SYMBOL(netif_device_attach); 1453 1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol) 1455 { 1456 return ((features & NETIF_F_GEN_CSUM) || 1457 ((features & NETIF_F_IP_CSUM) && 1458 protocol == htons(ETH_P_IP)) || 1459 ((features & NETIF_F_IPV6_CSUM) && 1460 protocol == htons(ETH_P_IPV6))); 1461 } 1462 1463 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) 1464 { 1465 if (can_checksum_protocol(dev->features, skb->protocol)) 1466 return true; 1467 1468 if (skb->protocol == htons(ETH_P_8021Q)) { 1469 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 1470 if (can_checksum_protocol(dev->features & dev->vlan_features, 1471 veh->h_vlan_encapsulated_proto)) 1472 return true; 1473 } 1474 1475 return false; 1476 } 1477 1478 /* 1479 * Invalidate hardware checksum when packet is to be mangled, and 1480 * complete checksum manually on outgoing path. 1481 */ 1482 int skb_checksum_help(struct sk_buff *skb) 1483 { 1484 __wsum csum; 1485 int ret = 0, offset; 1486 1487 if (skb->ip_summed == CHECKSUM_COMPLETE) 1488 goto out_set_summed; 1489 1490 if (unlikely(skb_shinfo(skb)->gso_size)) { 1491 /* Let GSO fix up the checksum. */ 1492 goto out_set_summed; 1493 } 1494 1495 offset = skb->csum_start - skb_headroom(skb); 1496 BUG_ON(offset >= skb_headlen(skb)); 1497 csum = skb_checksum(skb, offset, skb->len - offset, 0); 1498 1499 offset += skb->csum_offset; 1500 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 1501 1502 if (skb_cloned(skb) && 1503 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 1504 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 1505 if (ret) 1506 goto out; 1507 } 1508 1509 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 1510 out_set_summed: 1511 skb->ip_summed = CHECKSUM_NONE; 1512 out: 1513 return ret; 1514 } 1515 1516 /** 1517 * skb_gso_segment - Perform segmentation on skb. 1518 * @skb: buffer to segment 1519 * @features: features for the output path (see dev->features) 1520 * 1521 * This function segments the given skb and returns a list of segments. 1522 * 1523 * It may return NULL if the skb requires no segmentation. This is 1524 * only possible when GSO is used for verifying header integrity. 1525 */ 1526 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) 1527 { 1528 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 1529 struct packet_type *ptype; 1530 __be16 type = skb->protocol; 1531 int err; 1532 1533 skb_reset_mac_header(skb); 1534 skb->mac_len = skb->network_header - skb->mac_header; 1535 __skb_pull(skb, skb->mac_len); 1536 1537 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 1538 struct net_device *dev = skb->dev; 1539 struct ethtool_drvinfo info = {}; 1540 1541 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) 1542 dev->ethtool_ops->get_drvinfo(dev, &info); 1543 1544 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " 1545 "ip_summed=%d", 1546 info.driver, dev ? dev->features : 0L, 1547 skb->sk ? skb->sk->sk_route_caps : 0L, 1548 skb->len, skb->data_len, skb->ip_summed); 1549 1550 if (skb_header_cloned(skb) && 1551 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 1552 return ERR_PTR(err); 1553 } 1554 1555 rcu_read_lock(); 1556 list_for_each_entry_rcu(ptype, 1557 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 1558 if (ptype->type == type && !ptype->dev && ptype->gso_segment) { 1559 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 1560 err = ptype->gso_send_check(skb); 1561 segs = ERR_PTR(err); 1562 if (err || skb_gso_ok(skb, features)) 1563 break; 1564 __skb_push(skb, (skb->data - 1565 skb_network_header(skb))); 1566 } 1567 segs = ptype->gso_segment(skb, features); 1568 break; 1569 } 1570 } 1571 rcu_read_unlock(); 1572 1573 __skb_push(skb, skb->data - skb_mac_header(skb)); 1574 1575 return segs; 1576 } 1577 1578 EXPORT_SYMBOL(skb_gso_segment); 1579 1580 /* Take action when hardware reception checksum errors are detected. */ 1581 #ifdef CONFIG_BUG 1582 void netdev_rx_csum_fault(struct net_device *dev) 1583 { 1584 if (net_ratelimit()) { 1585 printk(KERN_ERR "%s: hw csum failure.\n", 1586 dev ? dev->name : "<unknown>"); 1587 dump_stack(); 1588 } 1589 } 1590 EXPORT_SYMBOL(netdev_rx_csum_fault); 1591 #endif 1592 1593 /* Actually, we should eliminate this check as soon as we know, that: 1594 * 1. IOMMU is present and allows to map all the memory. 1595 * 2. No high memory really exists on this machine. 1596 */ 1597 1598 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 1599 { 1600 #ifdef CONFIG_HIGHMEM 1601 int i; 1602 1603 if (dev->features & NETIF_F_HIGHDMA) 1604 return 0; 1605 1606 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1607 if (PageHighMem(skb_shinfo(skb)->frags[i].page)) 1608 return 1; 1609 1610 #endif 1611 return 0; 1612 } 1613 1614 struct dev_gso_cb { 1615 void (*destructor)(struct sk_buff *skb); 1616 }; 1617 1618 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) 1619 1620 static void dev_gso_skb_destructor(struct sk_buff *skb) 1621 { 1622 struct dev_gso_cb *cb; 1623 1624 do { 1625 struct sk_buff *nskb = skb->next; 1626 1627 skb->next = nskb->next; 1628 nskb->next = NULL; 1629 kfree_skb(nskb); 1630 } while (skb->next); 1631 1632 cb = DEV_GSO_CB(skb); 1633 if (cb->destructor) 1634 cb->destructor(skb); 1635 } 1636 1637 /** 1638 * dev_gso_segment - Perform emulated hardware segmentation on skb. 1639 * @skb: buffer to segment 1640 * 1641 * This function segments the given skb and stores the list of segments 1642 * in skb->next. 1643 */ 1644 static int dev_gso_segment(struct sk_buff *skb) 1645 { 1646 struct net_device *dev = skb->dev; 1647 struct sk_buff *segs; 1648 int features = dev->features & ~(illegal_highdma(dev, skb) ? 1649 NETIF_F_SG : 0); 1650 1651 segs = skb_gso_segment(skb, features); 1652 1653 /* Verifying header integrity only. */ 1654 if (!segs) 1655 return 0; 1656 1657 if (IS_ERR(segs)) 1658 return PTR_ERR(segs); 1659 1660 skb->next = segs; 1661 DEV_GSO_CB(skb)->destructor = skb->destructor; 1662 skb->destructor = dev_gso_skb_destructor; 1663 1664 return 0; 1665 } 1666 1667 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 1668 struct netdev_queue *txq) 1669 { 1670 const struct net_device_ops *ops = dev->netdev_ops; 1671 int rc; 1672 1673 if (likely(!skb->next)) { 1674 if (!list_empty(&ptype_all)) 1675 dev_queue_xmit_nit(skb, dev); 1676 1677 if (netif_needs_gso(dev, skb)) { 1678 if (unlikely(dev_gso_segment(skb))) 1679 goto out_kfree_skb; 1680 if (skb->next) 1681 goto gso; 1682 } 1683 1684 rc = ops->ndo_start_xmit(skb, dev); 1685 /* 1686 * TODO: if skb_orphan() was called by 1687 * dev->hard_start_xmit() (for example, the unmodified 1688 * igb driver does that; bnx2 doesn't), then 1689 * skb_tx_software_timestamp() will be unable to send 1690 * back the time stamp. 1691 * 1692 * How can this be prevented? Always create another 1693 * reference to the socket before calling 1694 * dev->hard_start_xmit()? Prevent that skb_orphan() 1695 * does anything in dev->hard_start_xmit() by clearing 1696 * the skb destructor before the call and restoring it 1697 * afterwards, then doing the skb_orphan() ourselves? 1698 */ 1699 return rc; 1700 } 1701 1702 gso: 1703 do { 1704 struct sk_buff *nskb = skb->next; 1705 1706 skb->next = nskb->next; 1707 nskb->next = NULL; 1708 rc = ops->ndo_start_xmit(nskb, dev); 1709 if (unlikely(rc)) { 1710 nskb->next = skb->next; 1711 skb->next = nskb; 1712 return rc; 1713 } 1714 if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) 1715 return NETDEV_TX_BUSY; 1716 } while (skb->next); 1717 1718 skb->destructor = DEV_GSO_CB(skb)->destructor; 1719 1720 out_kfree_skb: 1721 kfree_skb(skb); 1722 return 0; 1723 } 1724 1725 static u32 skb_tx_hashrnd; 1726 1727 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 1728 { 1729 u32 hash; 1730 1731 if (skb_rx_queue_recorded(skb)) { 1732 hash = skb_get_rx_queue(skb); 1733 } else if (skb->sk && skb->sk->sk_hash) { 1734 hash = skb->sk->sk_hash; 1735 } else 1736 hash = skb->protocol; 1737 1738 hash = jhash_1word(hash, skb_tx_hashrnd); 1739 1740 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 1741 } 1742 EXPORT_SYMBOL(skb_tx_hash); 1743 1744 static struct netdev_queue *dev_pick_tx(struct net_device *dev, 1745 struct sk_buff *skb) 1746 { 1747 const struct net_device_ops *ops = dev->netdev_ops; 1748 u16 queue_index = 0; 1749 1750 if (ops->ndo_select_queue) 1751 queue_index = ops->ndo_select_queue(dev, skb); 1752 else if (dev->real_num_tx_queues > 1) 1753 queue_index = skb_tx_hash(dev, skb); 1754 1755 skb_set_queue_mapping(skb, queue_index); 1756 return netdev_get_tx_queue(dev, queue_index); 1757 } 1758 1759 /** 1760 * dev_queue_xmit - transmit a buffer 1761 * @skb: buffer to transmit 1762 * 1763 * Queue a buffer for transmission to a network device. The caller must 1764 * have set the device and priority and built the buffer before calling 1765 * this function. The function can be called from an interrupt. 1766 * 1767 * A negative errno code is returned on a failure. A success does not 1768 * guarantee the frame will be transmitted as it may be dropped due 1769 * to congestion or traffic shaping. 1770 * 1771 * ----------------------------------------------------------------------------------- 1772 * I notice this method can also return errors from the queue disciplines, 1773 * including NET_XMIT_DROP, which is a positive value. So, errors can also 1774 * be positive. 1775 * 1776 * Regardless of the return value, the skb is consumed, so it is currently 1777 * difficult to retry a send to this method. (You can bump the ref count 1778 * before sending to hold a reference for retry if you are careful.) 1779 * 1780 * When calling this method, interrupts MUST be enabled. This is because 1781 * the BH enable code must have IRQs enabled so that it will not deadlock. 1782 * --BLG 1783 */ 1784 int dev_queue_xmit(struct sk_buff *skb) 1785 { 1786 struct net_device *dev = skb->dev; 1787 struct netdev_queue *txq; 1788 struct Qdisc *q; 1789 int rc = -ENOMEM; 1790 1791 /* GSO will handle the following emulations directly. */ 1792 if (netif_needs_gso(dev, skb)) 1793 goto gso; 1794 1795 if (skb_shinfo(skb)->frag_list && 1796 !(dev->features & NETIF_F_FRAGLIST) && 1797 __skb_linearize(skb)) 1798 goto out_kfree_skb; 1799 1800 /* Fragmented skb is linearized if device does not support SG, 1801 * or if at least one of fragments is in highmem and device 1802 * does not support DMA from it. 1803 */ 1804 if (skb_shinfo(skb)->nr_frags && 1805 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && 1806 __skb_linearize(skb)) 1807 goto out_kfree_skb; 1808 1809 /* If packet is not checksummed and device does not support 1810 * checksumming for this protocol, complete checksumming here. 1811 */ 1812 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1813 skb_set_transport_header(skb, skb->csum_start - 1814 skb_headroom(skb)); 1815 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) 1816 goto out_kfree_skb; 1817 } 1818 1819 gso: 1820 /* Disable soft irqs for various locks below. Also 1821 * stops preemption for RCU. 1822 */ 1823 rcu_read_lock_bh(); 1824 1825 txq = dev_pick_tx(dev, skb); 1826 q = rcu_dereference(txq->qdisc); 1827 1828 #ifdef CONFIG_NET_CLS_ACT 1829 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); 1830 #endif 1831 if (q->enqueue) { 1832 spinlock_t *root_lock = qdisc_lock(q); 1833 1834 spin_lock(root_lock); 1835 1836 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 1837 kfree_skb(skb); 1838 rc = NET_XMIT_DROP; 1839 } else { 1840 rc = qdisc_enqueue_root(skb, q); 1841 qdisc_run(q); 1842 } 1843 spin_unlock(root_lock); 1844 1845 goto out; 1846 } 1847 1848 /* The device has no queue. Common case for software devices: 1849 loopback, all the sorts of tunnels... 1850 1851 Really, it is unlikely that netif_tx_lock protection is necessary 1852 here. (f.e. loopback and IP tunnels are clean ignoring statistics 1853 counters.) 1854 However, it is possible, that they rely on protection 1855 made by us here. 1856 1857 Check this and shot the lock. It is not prone from deadlocks. 1858 Either shot noqueue qdisc, it is even simpler 8) 1859 */ 1860 if (dev->flags & IFF_UP) { 1861 int cpu = smp_processor_id(); /* ok because BHs are off */ 1862 1863 if (txq->xmit_lock_owner != cpu) { 1864 1865 HARD_TX_LOCK(dev, txq, cpu); 1866 1867 if (!netif_tx_queue_stopped(txq)) { 1868 rc = 0; 1869 if (!dev_hard_start_xmit(skb, dev, txq)) { 1870 HARD_TX_UNLOCK(dev, txq); 1871 goto out; 1872 } 1873 } 1874 HARD_TX_UNLOCK(dev, txq); 1875 if (net_ratelimit()) 1876 printk(KERN_CRIT "Virtual device %s asks to " 1877 "queue packet!\n", dev->name); 1878 } else { 1879 /* Recursion is detected! It is possible, 1880 * unfortunately */ 1881 if (net_ratelimit()) 1882 printk(KERN_CRIT "Dead loop on virtual device " 1883 "%s, fix it urgently!\n", dev->name); 1884 } 1885 } 1886 1887 rc = -ENETDOWN; 1888 rcu_read_unlock_bh(); 1889 1890 out_kfree_skb: 1891 kfree_skb(skb); 1892 return rc; 1893 out: 1894 rcu_read_unlock_bh(); 1895 return rc; 1896 } 1897 1898 1899 /*======================================================================= 1900 Receiver routines 1901 =======================================================================*/ 1902 1903 int netdev_max_backlog __read_mostly = 1000; 1904 int netdev_budget __read_mostly = 300; 1905 int weight_p __read_mostly = 64; /* old backlog weight */ 1906 1907 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 1908 1909 1910 /** 1911 * netif_rx - post buffer to the network code 1912 * @skb: buffer to post 1913 * 1914 * This function receives a packet from a device driver and queues it for 1915 * the upper (protocol) levels to process. It always succeeds. The buffer 1916 * may be dropped during processing for congestion control or by the 1917 * protocol layers. 1918 * 1919 * return values: 1920 * NET_RX_SUCCESS (no congestion) 1921 * NET_RX_DROP (packet was dropped) 1922 * 1923 */ 1924 1925 int netif_rx(struct sk_buff *skb) 1926 { 1927 struct softnet_data *queue; 1928 unsigned long flags; 1929 1930 /* if netpoll wants it, pretend we never saw it */ 1931 if (netpoll_rx(skb)) 1932 return NET_RX_DROP; 1933 1934 if (!skb->tstamp.tv64) 1935 net_timestamp(skb); 1936 1937 /* 1938 * The code is rearranged so that the path is the most 1939 * short when CPU is congested, but is still operating. 1940 */ 1941 local_irq_save(flags); 1942 queue = &__get_cpu_var(softnet_data); 1943 1944 __get_cpu_var(netdev_rx_stat).total++; 1945 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { 1946 if (queue->input_pkt_queue.qlen) { 1947 enqueue: 1948 __skb_queue_tail(&queue->input_pkt_queue, skb); 1949 local_irq_restore(flags); 1950 return NET_RX_SUCCESS; 1951 } 1952 1953 napi_schedule(&queue->backlog); 1954 goto enqueue; 1955 } 1956 1957 __get_cpu_var(netdev_rx_stat).dropped++; 1958 local_irq_restore(flags); 1959 1960 kfree_skb(skb); 1961 return NET_RX_DROP; 1962 } 1963 1964 int netif_rx_ni(struct sk_buff *skb) 1965 { 1966 int err; 1967 1968 preempt_disable(); 1969 err = netif_rx(skb); 1970 if (local_softirq_pending()) 1971 do_softirq(); 1972 preempt_enable(); 1973 1974 return err; 1975 } 1976 1977 EXPORT_SYMBOL(netif_rx_ni); 1978 1979 static void net_tx_action(struct softirq_action *h) 1980 { 1981 struct softnet_data *sd = &__get_cpu_var(softnet_data); 1982 1983 if (sd->completion_queue) { 1984 struct sk_buff *clist; 1985 1986 local_irq_disable(); 1987 clist = sd->completion_queue; 1988 sd->completion_queue = NULL; 1989 local_irq_enable(); 1990 1991 while (clist) { 1992 struct sk_buff *skb = clist; 1993 clist = clist->next; 1994 1995 WARN_ON(atomic_read(&skb->users)); 1996 __kfree_skb(skb); 1997 } 1998 } 1999 2000 if (sd->output_queue) { 2001 struct Qdisc *head; 2002 2003 local_irq_disable(); 2004 head = sd->output_queue; 2005 sd->output_queue = NULL; 2006 local_irq_enable(); 2007 2008 while (head) { 2009 struct Qdisc *q = head; 2010 spinlock_t *root_lock; 2011 2012 head = head->next_sched; 2013 2014 root_lock = qdisc_lock(q); 2015 if (spin_trylock(root_lock)) { 2016 smp_mb__before_clear_bit(); 2017 clear_bit(__QDISC_STATE_SCHED, 2018 &q->state); 2019 qdisc_run(q); 2020 spin_unlock(root_lock); 2021 } else { 2022 if (!test_bit(__QDISC_STATE_DEACTIVATED, 2023 &q->state)) { 2024 __netif_reschedule(q); 2025 } else { 2026 smp_mb__before_clear_bit(); 2027 clear_bit(__QDISC_STATE_SCHED, 2028 &q->state); 2029 } 2030 } 2031 } 2032 } 2033 } 2034 2035 static inline int deliver_skb(struct sk_buff *skb, 2036 struct packet_type *pt_prev, 2037 struct net_device *orig_dev) 2038 { 2039 atomic_inc(&skb->users); 2040 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2041 } 2042 2043 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) 2044 /* These hooks defined here for ATM */ 2045 struct net_bridge; 2046 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, 2047 unsigned char *addr); 2048 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; 2049 2050 /* 2051 * If bridge module is loaded call bridging hook. 2052 * returns NULL if packet was consumed. 2053 */ 2054 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, 2055 struct sk_buff *skb) __read_mostly; 2056 static inline struct sk_buff *handle_bridge(struct sk_buff *skb, 2057 struct packet_type **pt_prev, int *ret, 2058 struct net_device *orig_dev) 2059 { 2060 struct net_bridge_port *port; 2061 2062 if (skb->pkt_type == PACKET_LOOPBACK || 2063 (port = rcu_dereference(skb->dev->br_port)) == NULL) 2064 return skb; 2065 2066 if (*pt_prev) { 2067 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2068 *pt_prev = NULL; 2069 } 2070 2071 return br_handle_frame_hook(port, skb); 2072 } 2073 #else 2074 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) 2075 #endif 2076 2077 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) 2078 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; 2079 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); 2080 2081 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, 2082 struct packet_type **pt_prev, 2083 int *ret, 2084 struct net_device *orig_dev) 2085 { 2086 if (skb->dev->macvlan_port == NULL) 2087 return skb; 2088 2089 if (*pt_prev) { 2090 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2091 *pt_prev = NULL; 2092 } 2093 return macvlan_handle_frame_hook(skb); 2094 } 2095 #else 2096 #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) 2097 #endif 2098 2099 #ifdef CONFIG_NET_CLS_ACT 2100 /* TODO: Maybe we should just force sch_ingress to be compiled in 2101 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2102 * a compare and 2 stores extra right now if we dont have it on 2103 * but have CONFIG_NET_CLS_ACT 2104 * NOTE: This doesnt stop any functionality; if you dont have 2105 * the ingress scheduler, you just cant add policies on ingress. 2106 * 2107 */ 2108 static int ing_filter(struct sk_buff *skb) 2109 { 2110 struct net_device *dev = skb->dev; 2111 u32 ttl = G_TC_RTTL(skb->tc_verd); 2112 struct netdev_queue *rxq; 2113 int result = TC_ACT_OK; 2114 struct Qdisc *q; 2115 2116 if (MAX_RED_LOOP < ttl++) { 2117 printk(KERN_WARNING 2118 "Redir loop detected Dropping packet (%d->%d)\n", 2119 skb->iif, dev->ifindex); 2120 return TC_ACT_SHOT; 2121 } 2122 2123 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 2124 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 2125 2126 rxq = &dev->rx_queue; 2127 2128 q = rxq->qdisc; 2129 if (q != &noop_qdisc) { 2130 spin_lock(qdisc_lock(q)); 2131 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 2132 result = qdisc_enqueue_root(skb, q); 2133 spin_unlock(qdisc_lock(q)); 2134 } 2135 2136 return result; 2137 } 2138 2139 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 2140 struct packet_type **pt_prev, 2141 int *ret, struct net_device *orig_dev) 2142 { 2143 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 2144 goto out; 2145 2146 if (*pt_prev) { 2147 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2148 *pt_prev = NULL; 2149 } else { 2150 /* Huh? Why does turning on AF_PACKET affect this? */ 2151 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); 2152 } 2153 2154 switch (ing_filter(skb)) { 2155 case TC_ACT_SHOT: 2156 case TC_ACT_STOLEN: 2157 kfree_skb(skb); 2158 return NULL; 2159 } 2160 2161 out: 2162 skb->tc_verd = 0; 2163 return skb; 2164 } 2165 #endif 2166 2167 /* 2168 * netif_nit_deliver - deliver received packets to network taps 2169 * @skb: buffer 2170 * 2171 * This function is used to deliver incoming packets to network 2172 * taps. It should be used when the normal netif_receive_skb path 2173 * is bypassed, for example because of VLAN acceleration. 2174 */ 2175 void netif_nit_deliver(struct sk_buff *skb) 2176 { 2177 struct packet_type *ptype; 2178 2179 if (list_empty(&ptype_all)) 2180 return; 2181 2182 skb_reset_network_header(skb); 2183 skb_reset_transport_header(skb); 2184 skb->mac_len = skb->network_header - skb->mac_header; 2185 2186 rcu_read_lock(); 2187 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2188 if (!ptype->dev || ptype->dev == skb->dev) 2189 deliver_skb(skb, ptype, skb->dev); 2190 } 2191 rcu_read_unlock(); 2192 } 2193 2194 /** 2195 * netif_receive_skb - process receive buffer from network 2196 * @skb: buffer to process 2197 * 2198 * netif_receive_skb() is the main receive data processing function. 2199 * It always succeeds. The buffer may be dropped during processing 2200 * for congestion control or by the protocol layers. 2201 * 2202 * This function may only be called from softirq context and interrupts 2203 * should be enabled. 2204 * 2205 * Return values (usually ignored): 2206 * NET_RX_SUCCESS: no congestion 2207 * NET_RX_DROP: packet was dropped 2208 */ 2209 int netif_receive_skb(struct sk_buff *skb) 2210 { 2211 struct packet_type *ptype, *pt_prev; 2212 struct net_device *orig_dev; 2213 struct net_device *null_or_orig; 2214 int ret = NET_RX_DROP; 2215 __be16 type; 2216 2217 if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) 2218 return NET_RX_SUCCESS; 2219 2220 /* if we've gotten here through NAPI, check netpoll */ 2221 if (netpoll_receive_skb(skb)) 2222 return NET_RX_DROP; 2223 2224 if (!skb->tstamp.tv64) 2225 net_timestamp(skb); 2226 2227 if (!skb->iif) 2228 skb->iif = skb->dev->ifindex; 2229 2230 null_or_orig = NULL; 2231 orig_dev = skb->dev; 2232 if (orig_dev->master) { 2233 if (skb_bond_should_drop(skb)) 2234 null_or_orig = orig_dev; /* deliver only exact match */ 2235 else 2236 skb->dev = orig_dev->master; 2237 } 2238 2239 __get_cpu_var(netdev_rx_stat).total++; 2240 2241 skb_reset_network_header(skb); 2242 skb_reset_transport_header(skb); 2243 skb->mac_len = skb->network_header - skb->mac_header; 2244 2245 pt_prev = NULL; 2246 2247 rcu_read_lock(); 2248 2249 #ifdef CONFIG_NET_CLS_ACT 2250 if (skb->tc_verd & TC_NCLS) { 2251 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 2252 goto ncls; 2253 } 2254 #endif 2255 2256 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2257 if (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2258 ptype->dev == orig_dev) { 2259 if (pt_prev) 2260 ret = deliver_skb(skb, pt_prev, orig_dev); 2261 pt_prev = ptype; 2262 } 2263 } 2264 2265 #ifdef CONFIG_NET_CLS_ACT 2266 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 2267 if (!skb) 2268 goto out; 2269 ncls: 2270 #endif 2271 2272 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); 2273 if (!skb) 2274 goto out; 2275 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); 2276 if (!skb) 2277 goto out; 2278 2279 skb_orphan(skb); 2280 2281 type = skb->protocol; 2282 list_for_each_entry_rcu(ptype, 2283 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 2284 if (ptype->type == type && 2285 (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2286 ptype->dev == orig_dev)) { 2287 if (pt_prev) 2288 ret = deliver_skb(skb, pt_prev, orig_dev); 2289 pt_prev = ptype; 2290 } 2291 } 2292 2293 if (pt_prev) { 2294 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2295 } else { 2296 kfree_skb(skb); 2297 /* Jamal, now you will not able to escape explaining 2298 * me how you were going to use this. :-) 2299 */ 2300 ret = NET_RX_DROP; 2301 } 2302 2303 out: 2304 rcu_read_unlock(); 2305 return ret; 2306 } 2307 2308 /* Network device is going away, flush any packets still pending */ 2309 static void flush_backlog(void *arg) 2310 { 2311 struct net_device *dev = arg; 2312 struct softnet_data *queue = &__get_cpu_var(softnet_data); 2313 struct sk_buff *skb, *tmp; 2314 2315 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) 2316 if (skb->dev == dev) { 2317 __skb_unlink(skb, &queue->input_pkt_queue); 2318 kfree_skb(skb); 2319 } 2320 } 2321 2322 static int napi_gro_complete(struct sk_buff *skb) 2323 { 2324 struct packet_type *ptype; 2325 __be16 type = skb->protocol; 2326 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 2327 int err = -ENOENT; 2328 2329 if (NAPI_GRO_CB(skb)->count == 1) 2330 goto out; 2331 2332 rcu_read_lock(); 2333 list_for_each_entry_rcu(ptype, head, list) { 2334 if (ptype->type != type || ptype->dev || !ptype->gro_complete) 2335 continue; 2336 2337 err = ptype->gro_complete(skb); 2338 break; 2339 } 2340 rcu_read_unlock(); 2341 2342 if (err) { 2343 WARN_ON(&ptype->list == head); 2344 kfree_skb(skb); 2345 return NET_RX_SUCCESS; 2346 } 2347 2348 out: 2349 skb_shinfo(skb)->gso_size = 0; 2350 return netif_receive_skb(skb); 2351 } 2352 2353 void napi_gro_flush(struct napi_struct *napi) 2354 { 2355 struct sk_buff *skb, *next; 2356 2357 for (skb = napi->gro_list; skb; skb = next) { 2358 next = skb->next; 2359 skb->next = NULL; 2360 napi_gro_complete(skb); 2361 } 2362 2363 napi->gro_count = 0; 2364 napi->gro_list = NULL; 2365 } 2366 EXPORT_SYMBOL(napi_gro_flush); 2367 2368 void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) 2369 { 2370 unsigned int offset = skb_gro_offset(skb); 2371 2372 hlen += offset; 2373 if (hlen <= skb_headlen(skb)) 2374 return skb->data + offset; 2375 2376 if (unlikely(!skb_shinfo(skb)->nr_frags || 2377 skb_shinfo(skb)->frags[0].size <= 2378 hlen - skb_headlen(skb) || 2379 PageHighMem(skb_shinfo(skb)->frags[0].page))) 2380 return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; 2381 2382 return page_address(skb_shinfo(skb)->frags[0].page) + 2383 skb_shinfo(skb)->frags[0].page_offset + 2384 offset - skb_headlen(skb); 2385 } 2386 EXPORT_SYMBOL(skb_gro_header); 2387 2388 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 2389 { 2390 struct sk_buff **pp = NULL; 2391 struct packet_type *ptype; 2392 __be16 type = skb->protocol; 2393 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 2394 int same_flow; 2395 int mac_len; 2396 int ret; 2397 2398 if (!(skb->dev->features & NETIF_F_GRO)) 2399 goto normal; 2400 2401 if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list) 2402 goto normal; 2403 2404 rcu_read_lock(); 2405 list_for_each_entry_rcu(ptype, head, list) { 2406 if (ptype->type != type || ptype->dev || !ptype->gro_receive) 2407 continue; 2408 2409 skb_set_network_header(skb, skb_gro_offset(skb)); 2410 mac_len = skb->network_header - skb->mac_header; 2411 skb->mac_len = mac_len; 2412 NAPI_GRO_CB(skb)->same_flow = 0; 2413 NAPI_GRO_CB(skb)->flush = 0; 2414 NAPI_GRO_CB(skb)->free = 0; 2415 2416 pp = ptype->gro_receive(&napi->gro_list, skb); 2417 break; 2418 } 2419 rcu_read_unlock(); 2420 2421 if (&ptype->list == head) 2422 goto normal; 2423 2424 same_flow = NAPI_GRO_CB(skb)->same_flow; 2425 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 2426 2427 if (pp) { 2428 struct sk_buff *nskb = *pp; 2429 2430 *pp = nskb->next; 2431 nskb->next = NULL; 2432 napi_gro_complete(nskb); 2433 napi->gro_count--; 2434 } 2435 2436 if (same_flow) 2437 goto ok; 2438 2439 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) 2440 goto normal; 2441 2442 napi->gro_count++; 2443 NAPI_GRO_CB(skb)->count = 1; 2444 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 2445 skb->next = napi->gro_list; 2446 napi->gro_list = skb; 2447 ret = GRO_HELD; 2448 2449 pull: 2450 if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) { 2451 if (napi->gro_list == skb) 2452 napi->gro_list = skb->next; 2453 ret = GRO_DROP; 2454 } 2455 2456 ok: 2457 return ret; 2458 2459 normal: 2460 ret = GRO_NORMAL; 2461 goto pull; 2462 } 2463 EXPORT_SYMBOL(dev_gro_receive); 2464 2465 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 2466 { 2467 struct sk_buff *p; 2468 2469 if (netpoll_rx_on(skb)) 2470 return GRO_NORMAL; 2471 2472 for (p = napi->gro_list; p; p = p->next) { 2473 NAPI_GRO_CB(p)->same_flow = !compare_ether_header( 2474 skb_mac_header(p), skb_gro_mac_header(skb)); 2475 NAPI_GRO_CB(p)->flush = 0; 2476 } 2477 2478 return dev_gro_receive(napi, skb); 2479 } 2480 2481 int napi_skb_finish(int ret, struct sk_buff *skb) 2482 { 2483 int err = NET_RX_SUCCESS; 2484 2485 switch (ret) { 2486 case GRO_NORMAL: 2487 return netif_receive_skb(skb); 2488 2489 case GRO_DROP: 2490 err = NET_RX_DROP; 2491 /* fall through */ 2492 2493 case GRO_MERGED_FREE: 2494 kfree_skb(skb); 2495 break; 2496 } 2497 2498 return err; 2499 } 2500 EXPORT_SYMBOL(napi_skb_finish); 2501 2502 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 2503 { 2504 skb_gro_reset_offset(skb); 2505 2506 return napi_skb_finish(__napi_gro_receive(napi, skb), skb); 2507 } 2508 EXPORT_SYMBOL(napi_gro_receive); 2509 2510 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 2511 { 2512 __skb_pull(skb, skb_headlen(skb)); 2513 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 2514 2515 napi->skb = skb; 2516 } 2517 EXPORT_SYMBOL(napi_reuse_skb); 2518 2519 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, 2520 struct napi_gro_fraginfo *info) 2521 { 2522 struct net_device *dev = napi->dev; 2523 struct sk_buff *skb = napi->skb; 2524 struct ethhdr *eth; 2525 skb_frag_t *frag; 2526 int i; 2527 2528 napi->skb = NULL; 2529 2530 if (!skb) { 2531 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); 2532 if (!skb) 2533 goto out; 2534 2535 skb_reserve(skb, NET_IP_ALIGN); 2536 } 2537 2538 BUG_ON(info->nr_frags > MAX_SKB_FRAGS); 2539 frag = &info->frags[info->nr_frags - 1]; 2540 2541 for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { 2542 skb_fill_page_desc(skb, i, frag->page, frag->page_offset, 2543 frag->size); 2544 frag++; 2545 } 2546 skb_shinfo(skb)->nr_frags = info->nr_frags; 2547 2548 skb->data_len = info->len; 2549 skb->len += info->len; 2550 skb->truesize += info->len; 2551 2552 skb_reset_mac_header(skb); 2553 skb_gro_reset_offset(skb); 2554 2555 eth = skb_gro_header(skb, sizeof(*eth)); 2556 if (!eth) { 2557 napi_reuse_skb(napi, skb); 2558 skb = NULL; 2559 goto out; 2560 } 2561 2562 skb_gro_pull(skb, sizeof(*eth)); 2563 2564 /* 2565 * This works because the only protocols we care about don't require 2566 * special handling. We'll fix it up properly at the end. 2567 */ 2568 skb->protocol = eth->h_proto; 2569 2570 skb->ip_summed = info->ip_summed; 2571 skb->csum = info->csum; 2572 2573 out: 2574 return skb; 2575 } 2576 EXPORT_SYMBOL(napi_fraginfo_skb); 2577 2578 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) 2579 { 2580 int err = NET_RX_SUCCESS; 2581 2582 switch (ret) { 2583 case GRO_NORMAL: 2584 case GRO_HELD: 2585 skb->protocol = eth_type_trans(skb, napi->dev); 2586 2587 if (ret == GRO_NORMAL) 2588 return netif_receive_skb(skb); 2589 2590 skb_gro_pull(skb, -ETH_HLEN); 2591 break; 2592 2593 case GRO_DROP: 2594 err = NET_RX_DROP; 2595 /* fall through */ 2596 2597 case GRO_MERGED_FREE: 2598 napi_reuse_skb(napi, skb); 2599 break; 2600 } 2601 2602 return err; 2603 } 2604 EXPORT_SYMBOL(napi_frags_finish); 2605 2606 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) 2607 { 2608 struct sk_buff *skb = napi_fraginfo_skb(napi, info); 2609 2610 if (!skb) 2611 return NET_RX_DROP; 2612 2613 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); 2614 } 2615 EXPORT_SYMBOL(napi_gro_frags); 2616 2617 static int process_backlog(struct napi_struct *napi, int quota) 2618 { 2619 int work = 0; 2620 struct softnet_data *queue = &__get_cpu_var(softnet_data); 2621 unsigned long start_time = jiffies; 2622 2623 napi->weight = weight_p; 2624 do { 2625 struct sk_buff *skb; 2626 2627 local_irq_disable(); 2628 skb = __skb_dequeue(&queue->input_pkt_queue); 2629 if (!skb) { 2630 __napi_complete(napi); 2631 local_irq_enable(); 2632 break; 2633 } 2634 local_irq_enable(); 2635 2636 netif_receive_skb(skb); 2637 } while (++work < quota && jiffies == start_time); 2638 2639 return work; 2640 } 2641 2642 /** 2643 * __napi_schedule - schedule for receive 2644 * @n: entry to schedule 2645 * 2646 * The entry's receive function will be scheduled to run 2647 */ 2648 void __napi_schedule(struct napi_struct *n) 2649 { 2650 unsigned long flags; 2651 2652 local_irq_save(flags); 2653 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); 2654 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2655 local_irq_restore(flags); 2656 } 2657 EXPORT_SYMBOL(__napi_schedule); 2658 2659 void __napi_complete(struct napi_struct *n) 2660 { 2661 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 2662 BUG_ON(n->gro_list); 2663 2664 list_del(&n->poll_list); 2665 smp_mb__before_clear_bit(); 2666 clear_bit(NAPI_STATE_SCHED, &n->state); 2667 } 2668 EXPORT_SYMBOL(__napi_complete); 2669 2670 void napi_complete(struct napi_struct *n) 2671 { 2672 unsigned long flags; 2673 2674 /* 2675 * don't let napi dequeue from the cpu poll list 2676 * just in case its running on a different cpu 2677 */ 2678 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 2679 return; 2680 2681 napi_gro_flush(n); 2682 local_irq_save(flags); 2683 __napi_complete(n); 2684 local_irq_restore(flags); 2685 } 2686 EXPORT_SYMBOL(napi_complete); 2687 2688 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 2689 int (*poll)(struct napi_struct *, int), int weight) 2690 { 2691 INIT_LIST_HEAD(&napi->poll_list); 2692 napi->gro_count = 0; 2693 napi->gro_list = NULL; 2694 napi->skb = NULL; 2695 napi->poll = poll; 2696 napi->weight = weight; 2697 list_add(&napi->dev_list, &dev->napi_list); 2698 napi->dev = dev; 2699 #ifdef CONFIG_NETPOLL 2700 spin_lock_init(&napi->poll_lock); 2701 napi->poll_owner = -1; 2702 #endif 2703 set_bit(NAPI_STATE_SCHED, &napi->state); 2704 } 2705 EXPORT_SYMBOL(netif_napi_add); 2706 2707 void netif_napi_del(struct napi_struct *napi) 2708 { 2709 struct sk_buff *skb, *next; 2710 2711 list_del_init(&napi->dev_list); 2712 kfree_skb(napi->skb); 2713 2714 for (skb = napi->gro_list; skb; skb = next) { 2715 next = skb->next; 2716 skb->next = NULL; 2717 kfree_skb(skb); 2718 } 2719 2720 napi->gro_list = NULL; 2721 napi->gro_count = 0; 2722 } 2723 EXPORT_SYMBOL(netif_napi_del); 2724 2725 2726 static void net_rx_action(struct softirq_action *h) 2727 { 2728 struct list_head *list = &__get_cpu_var(softnet_data).poll_list; 2729 unsigned long time_limit = jiffies + 2; 2730 int budget = netdev_budget; 2731 void *have; 2732 2733 local_irq_disable(); 2734 2735 while (!list_empty(list)) { 2736 struct napi_struct *n; 2737 int work, weight; 2738 2739 /* If softirq window is exhuasted then punt. 2740 * Allow this to run for 2 jiffies since which will allow 2741 * an average latency of 1.5/HZ. 2742 */ 2743 if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) 2744 goto softnet_break; 2745 2746 local_irq_enable(); 2747 2748 /* Even though interrupts have been re-enabled, this 2749 * access is safe because interrupts can only add new 2750 * entries to the tail of this list, and only ->poll() 2751 * calls can remove this head entry from the list. 2752 */ 2753 n = list_entry(list->next, struct napi_struct, poll_list); 2754 2755 have = netpoll_poll_lock(n); 2756 2757 weight = n->weight; 2758 2759 /* This NAPI_STATE_SCHED test is for avoiding a race 2760 * with netpoll's poll_napi(). Only the entity which 2761 * obtains the lock and sees NAPI_STATE_SCHED set will 2762 * actually make the ->poll() call. Therefore we avoid 2763 * accidently calling ->poll() when NAPI is not scheduled. 2764 */ 2765 work = 0; 2766 if (test_bit(NAPI_STATE_SCHED, &n->state)) 2767 work = n->poll(n, weight); 2768 2769 WARN_ON_ONCE(work > weight); 2770 2771 budget -= work; 2772 2773 local_irq_disable(); 2774 2775 /* Drivers must not modify the NAPI state if they 2776 * consume the entire weight. In such cases this code 2777 * still "owns" the NAPI instance and therefore can 2778 * move the instance around on the list at-will. 2779 */ 2780 if (unlikely(work == weight)) { 2781 if (unlikely(napi_disable_pending(n))) 2782 __napi_complete(n); 2783 else 2784 list_move_tail(&n->poll_list, list); 2785 } 2786 2787 netpoll_poll_unlock(have); 2788 } 2789 out: 2790 local_irq_enable(); 2791 2792 #ifdef CONFIG_NET_DMA 2793 /* 2794 * There may not be any more sk_buffs coming right now, so push 2795 * any pending DMA copies to hardware 2796 */ 2797 dma_issue_pending_all(); 2798 #endif 2799 2800 return; 2801 2802 softnet_break: 2803 __get_cpu_var(netdev_rx_stat).time_squeeze++; 2804 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2805 goto out; 2806 } 2807 2808 static gifconf_func_t * gifconf_list [NPROTO]; 2809 2810 /** 2811 * register_gifconf - register a SIOCGIF handler 2812 * @family: Address family 2813 * @gifconf: Function handler 2814 * 2815 * Register protocol dependent address dumping routines. The handler 2816 * that is passed must not be freed or reused until it has been replaced 2817 * by another handler. 2818 */ 2819 int register_gifconf(unsigned int family, gifconf_func_t * gifconf) 2820 { 2821 if (family >= NPROTO) 2822 return -EINVAL; 2823 gifconf_list[family] = gifconf; 2824 return 0; 2825 } 2826 2827 2828 /* 2829 * Map an interface index to its name (SIOCGIFNAME) 2830 */ 2831 2832 /* 2833 * We need this ioctl for efficient implementation of the 2834 * if_indextoname() function required by the IPv6 API. Without 2835 * it, we would have to search all the interfaces to find a 2836 * match. --pb 2837 */ 2838 2839 static int dev_ifname(struct net *net, struct ifreq __user *arg) 2840 { 2841 struct net_device *dev; 2842 struct ifreq ifr; 2843 2844 /* 2845 * Fetch the caller's info block. 2846 */ 2847 2848 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 2849 return -EFAULT; 2850 2851 read_lock(&dev_base_lock); 2852 dev = __dev_get_by_index(net, ifr.ifr_ifindex); 2853 if (!dev) { 2854 read_unlock(&dev_base_lock); 2855 return -ENODEV; 2856 } 2857 2858 strcpy(ifr.ifr_name, dev->name); 2859 read_unlock(&dev_base_lock); 2860 2861 if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) 2862 return -EFAULT; 2863 return 0; 2864 } 2865 2866 /* 2867 * Perform a SIOCGIFCONF call. This structure will change 2868 * size eventually, and there is nothing I can do about it. 2869 * Thus we will need a 'compatibility mode'. 2870 */ 2871 2872 static int dev_ifconf(struct net *net, char __user *arg) 2873 { 2874 struct ifconf ifc; 2875 struct net_device *dev; 2876 char __user *pos; 2877 int len; 2878 int total; 2879 int i; 2880 2881 /* 2882 * Fetch the caller's info block. 2883 */ 2884 2885 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 2886 return -EFAULT; 2887 2888 pos = ifc.ifc_buf; 2889 len = ifc.ifc_len; 2890 2891 /* 2892 * Loop over the interfaces, and write an info block for each. 2893 */ 2894 2895 total = 0; 2896 for_each_netdev(net, dev) { 2897 for (i = 0; i < NPROTO; i++) { 2898 if (gifconf_list[i]) { 2899 int done; 2900 if (!pos) 2901 done = gifconf_list[i](dev, NULL, 0); 2902 else 2903 done = gifconf_list[i](dev, pos + total, 2904 len - total); 2905 if (done < 0) 2906 return -EFAULT; 2907 total += done; 2908 } 2909 } 2910 } 2911 2912 /* 2913 * All done. Write the updated control block back to the caller. 2914 */ 2915 ifc.ifc_len = total; 2916 2917 /* 2918 * Both BSD and Solaris return 0 here, so we do too. 2919 */ 2920 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; 2921 } 2922 2923 #ifdef CONFIG_PROC_FS 2924 /* 2925 * This is invoked by the /proc filesystem handler to display a device 2926 * in detail. 2927 */ 2928 void *dev_seq_start(struct seq_file *seq, loff_t *pos) 2929 __acquires(dev_base_lock) 2930 { 2931 struct net *net = seq_file_net(seq); 2932 loff_t off; 2933 struct net_device *dev; 2934 2935 read_lock(&dev_base_lock); 2936 if (!*pos) 2937 return SEQ_START_TOKEN; 2938 2939 off = 1; 2940 for_each_netdev(net, dev) 2941 if (off++ == *pos) 2942 return dev; 2943 2944 return NULL; 2945 } 2946 2947 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2948 { 2949 struct net *net = seq_file_net(seq); 2950 ++*pos; 2951 return v == SEQ_START_TOKEN ? 2952 first_net_device(net) : next_net_device((struct net_device *)v); 2953 } 2954 2955 void dev_seq_stop(struct seq_file *seq, void *v) 2956 __releases(dev_base_lock) 2957 { 2958 read_unlock(&dev_base_lock); 2959 } 2960 2961 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 2962 { 2963 const struct net_device_stats *stats = dev_get_stats(dev); 2964 2965 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " 2966 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", 2967 dev->name, stats->rx_bytes, stats->rx_packets, 2968 stats->rx_errors, 2969 stats->rx_dropped + stats->rx_missed_errors, 2970 stats->rx_fifo_errors, 2971 stats->rx_length_errors + stats->rx_over_errors + 2972 stats->rx_crc_errors + stats->rx_frame_errors, 2973 stats->rx_compressed, stats->multicast, 2974 stats->tx_bytes, stats->tx_packets, 2975 stats->tx_errors, stats->tx_dropped, 2976 stats->tx_fifo_errors, stats->collisions, 2977 stats->tx_carrier_errors + 2978 stats->tx_aborted_errors + 2979 stats->tx_window_errors + 2980 stats->tx_heartbeat_errors, 2981 stats->tx_compressed); 2982 } 2983 2984 /* 2985 * Called from the PROCfs module. This now uses the new arbitrary sized 2986 * /proc/net interface to create /proc/net/dev 2987 */ 2988 static int dev_seq_show(struct seq_file *seq, void *v) 2989 { 2990 if (v == SEQ_START_TOKEN) 2991 seq_puts(seq, "Inter-| Receive " 2992 " | Transmit\n" 2993 " face |bytes packets errs drop fifo frame " 2994 "compressed multicast|bytes packets errs " 2995 "drop fifo colls carrier compressed\n"); 2996 else 2997 dev_seq_printf_stats(seq, v); 2998 return 0; 2999 } 3000 3001 static struct netif_rx_stats *softnet_get_online(loff_t *pos) 3002 { 3003 struct netif_rx_stats *rc = NULL; 3004 3005 while (*pos < nr_cpu_ids) 3006 if (cpu_online(*pos)) { 3007 rc = &per_cpu(netdev_rx_stat, *pos); 3008 break; 3009 } else 3010 ++*pos; 3011 return rc; 3012 } 3013 3014 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 3015 { 3016 return softnet_get_online(pos); 3017 } 3018 3019 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3020 { 3021 ++*pos; 3022 return softnet_get_online(pos); 3023 } 3024 3025 static void softnet_seq_stop(struct seq_file *seq, void *v) 3026 { 3027 } 3028 3029 static int softnet_seq_show(struct seq_file *seq, void *v) 3030 { 3031 struct netif_rx_stats *s = v; 3032 3033 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 3034 s->total, s->dropped, s->time_squeeze, 0, 3035 0, 0, 0, 0, /* was fastroute */ 3036 s->cpu_collision ); 3037 return 0; 3038 } 3039 3040 static const struct seq_operations dev_seq_ops = { 3041 .start = dev_seq_start, 3042 .next = dev_seq_next, 3043 .stop = dev_seq_stop, 3044 .show = dev_seq_show, 3045 }; 3046 3047 static int dev_seq_open(struct inode *inode, struct file *file) 3048 { 3049 return seq_open_net(inode, file, &dev_seq_ops, 3050 sizeof(struct seq_net_private)); 3051 } 3052 3053 static const struct file_operations dev_seq_fops = { 3054 .owner = THIS_MODULE, 3055 .open = dev_seq_open, 3056 .read = seq_read, 3057 .llseek = seq_lseek, 3058 .release = seq_release_net, 3059 }; 3060 3061 static const struct seq_operations softnet_seq_ops = { 3062 .start = softnet_seq_start, 3063 .next = softnet_seq_next, 3064 .stop = softnet_seq_stop, 3065 .show = softnet_seq_show, 3066 }; 3067 3068 static int softnet_seq_open(struct inode *inode, struct file *file) 3069 { 3070 return seq_open(file, &softnet_seq_ops); 3071 } 3072 3073 static const struct file_operations softnet_seq_fops = { 3074 .owner = THIS_MODULE, 3075 .open = softnet_seq_open, 3076 .read = seq_read, 3077 .llseek = seq_lseek, 3078 .release = seq_release, 3079 }; 3080 3081 static void *ptype_get_idx(loff_t pos) 3082 { 3083 struct packet_type *pt = NULL; 3084 loff_t i = 0; 3085 int t; 3086 3087 list_for_each_entry_rcu(pt, &ptype_all, list) { 3088 if (i == pos) 3089 return pt; 3090 ++i; 3091 } 3092 3093 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 3094 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 3095 if (i == pos) 3096 return pt; 3097 ++i; 3098 } 3099 } 3100 return NULL; 3101 } 3102 3103 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 3104 __acquires(RCU) 3105 { 3106 rcu_read_lock(); 3107 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; 3108 } 3109 3110 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3111 { 3112 struct packet_type *pt; 3113 struct list_head *nxt; 3114 int hash; 3115 3116 ++*pos; 3117 if (v == SEQ_START_TOKEN) 3118 return ptype_get_idx(0); 3119 3120 pt = v; 3121 nxt = pt->list.next; 3122 if (pt->type == htons(ETH_P_ALL)) { 3123 if (nxt != &ptype_all) 3124 goto found; 3125 hash = 0; 3126 nxt = ptype_base[0].next; 3127 } else 3128 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 3129 3130 while (nxt == &ptype_base[hash]) { 3131 if (++hash >= PTYPE_HASH_SIZE) 3132 return NULL; 3133 nxt = ptype_base[hash].next; 3134 } 3135 found: 3136 return list_entry(nxt, struct packet_type, list); 3137 } 3138 3139 static void ptype_seq_stop(struct seq_file *seq, void *v) 3140 __releases(RCU) 3141 { 3142 rcu_read_unlock(); 3143 } 3144 3145 static int ptype_seq_show(struct seq_file *seq, void *v) 3146 { 3147 struct packet_type *pt = v; 3148 3149 if (v == SEQ_START_TOKEN) 3150 seq_puts(seq, "Type Device Function\n"); 3151 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 3152 if (pt->type == htons(ETH_P_ALL)) 3153 seq_puts(seq, "ALL "); 3154 else 3155 seq_printf(seq, "%04x", ntohs(pt->type)); 3156 3157 seq_printf(seq, " %-8s %pF\n", 3158 pt->dev ? pt->dev->name : "", pt->func); 3159 } 3160 3161 return 0; 3162 } 3163 3164 static const struct seq_operations ptype_seq_ops = { 3165 .start = ptype_seq_start, 3166 .next = ptype_seq_next, 3167 .stop = ptype_seq_stop, 3168 .show = ptype_seq_show, 3169 }; 3170 3171 static int ptype_seq_open(struct inode *inode, struct file *file) 3172 { 3173 return seq_open_net(inode, file, &ptype_seq_ops, 3174 sizeof(struct seq_net_private)); 3175 } 3176 3177 static const struct file_operations ptype_seq_fops = { 3178 .owner = THIS_MODULE, 3179 .open = ptype_seq_open, 3180 .read = seq_read, 3181 .llseek = seq_lseek, 3182 .release = seq_release_net, 3183 }; 3184 3185 3186 static int __net_init dev_proc_net_init(struct net *net) 3187 { 3188 int rc = -ENOMEM; 3189 3190 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) 3191 goto out; 3192 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) 3193 goto out_dev; 3194 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) 3195 goto out_softnet; 3196 3197 if (wext_proc_init(net)) 3198 goto out_ptype; 3199 rc = 0; 3200 out: 3201 return rc; 3202 out_ptype: 3203 proc_net_remove(net, "ptype"); 3204 out_softnet: 3205 proc_net_remove(net, "softnet_stat"); 3206 out_dev: 3207 proc_net_remove(net, "dev"); 3208 goto out; 3209 } 3210 3211 static void __net_exit dev_proc_net_exit(struct net *net) 3212 { 3213 wext_proc_exit(net); 3214 3215 proc_net_remove(net, "ptype"); 3216 proc_net_remove(net, "softnet_stat"); 3217 proc_net_remove(net, "dev"); 3218 } 3219 3220 static struct pernet_operations __net_initdata dev_proc_ops = { 3221 .init = dev_proc_net_init, 3222 .exit = dev_proc_net_exit, 3223 }; 3224 3225 static int __init dev_proc_init(void) 3226 { 3227 return register_pernet_subsys(&dev_proc_ops); 3228 } 3229 #else 3230 #define dev_proc_init() 0 3231 #endif /* CONFIG_PROC_FS */ 3232 3233 3234 /** 3235 * netdev_set_master - set up master/slave pair 3236 * @slave: slave device 3237 * @master: new master device 3238 * 3239 * Changes the master device of the slave. Pass %NULL to break the 3240 * bonding. The caller must hold the RTNL semaphore. On a failure 3241 * a negative errno code is returned. On success the reference counts 3242 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 3243 * function returns zero. 3244 */ 3245 int netdev_set_master(struct net_device *slave, struct net_device *master) 3246 { 3247 struct net_device *old = slave->master; 3248 3249 ASSERT_RTNL(); 3250 3251 if (master) { 3252 if (old) 3253 return -EBUSY; 3254 dev_hold(master); 3255 } 3256 3257 slave->master = master; 3258 3259 synchronize_net(); 3260 3261 if (old) 3262 dev_put(old); 3263 3264 if (master) 3265 slave->flags |= IFF_SLAVE; 3266 else 3267 slave->flags &= ~IFF_SLAVE; 3268 3269 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 3270 return 0; 3271 } 3272 3273 static void dev_change_rx_flags(struct net_device *dev, int flags) 3274 { 3275 const struct net_device_ops *ops = dev->netdev_ops; 3276 3277 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) 3278 ops->ndo_change_rx_flags(dev, flags); 3279 } 3280 3281 static int __dev_set_promiscuity(struct net_device *dev, int inc) 3282 { 3283 unsigned short old_flags = dev->flags; 3284 uid_t uid; 3285 gid_t gid; 3286 3287 ASSERT_RTNL(); 3288 3289 dev->flags |= IFF_PROMISC; 3290 dev->promiscuity += inc; 3291 if (dev->promiscuity == 0) { 3292 /* 3293 * Avoid overflow. 3294 * If inc causes overflow, untouch promisc and return error. 3295 */ 3296 if (inc < 0) 3297 dev->flags &= ~IFF_PROMISC; 3298 else { 3299 dev->promiscuity -= inc; 3300 printk(KERN_WARNING "%s: promiscuity touches roof, " 3301 "set promiscuity failed, promiscuity feature " 3302 "of device might be broken.\n", dev->name); 3303 return -EOVERFLOW; 3304 } 3305 } 3306 if (dev->flags != old_flags) { 3307 printk(KERN_INFO "device %s %s promiscuous mode\n", 3308 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 3309 "left"); 3310 if (audit_enabled) { 3311 current_uid_gid(&uid, &gid); 3312 audit_log(current->audit_context, GFP_ATOMIC, 3313 AUDIT_ANOM_PROMISCUOUS, 3314 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 3315 dev->name, (dev->flags & IFF_PROMISC), 3316 (old_flags & IFF_PROMISC), 3317 audit_get_loginuid(current), 3318 uid, gid, 3319 audit_get_sessionid(current)); 3320 } 3321 3322 dev_change_rx_flags(dev, IFF_PROMISC); 3323 } 3324 return 0; 3325 } 3326 3327 /** 3328 * dev_set_promiscuity - update promiscuity count on a device 3329 * @dev: device 3330 * @inc: modifier 3331 * 3332 * Add or remove promiscuity from a device. While the count in the device 3333 * remains above zero the interface remains promiscuous. Once it hits zero 3334 * the device reverts back to normal filtering operation. A negative inc 3335 * value is used to drop promiscuity on the device. 3336 * Return 0 if successful or a negative errno code on error. 3337 */ 3338 int dev_set_promiscuity(struct net_device *dev, int inc) 3339 { 3340 unsigned short old_flags = dev->flags; 3341 int err; 3342 3343 err = __dev_set_promiscuity(dev, inc); 3344 if (err < 0) 3345 return err; 3346 if (dev->flags != old_flags) 3347 dev_set_rx_mode(dev); 3348 return err; 3349 } 3350 3351 /** 3352 * dev_set_allmulti - update allmulti count on a device 3353 * @dev: device 3354 * @inc: modifier 3355 * 3356 * Add or remove reception of all multicast frames to a device. While the 3357 * count in the device remains above zero the interface remains listening 3358 * to all interfaces. Once it hits zero the device reverts back to normal 3359 * filtering operation. A negative @inc value is used to drop the counter 3360 * when releasing a resource needing all multicasts. 3361 * Return 0 if successful or a negative errno code on error. 3362 */ 3363 3364 int dev_set_allmulti(struct net_device *dev, int inc) 3365 { 3366 unsigned short old_flags = dev->flags; 3367 3368 ASSERT_RTNL(); 3369 3370 dev->flags |= IFF_ALLMULTI; 3371 dev->allmulti += inc; 3372 if (dev->allmulti == 0) { 3373 /* 3374 * Avoid overflow. 3375 * If inc causes overflow, untouch allmulti and return error. 3376 */ 3377 if (inc < 0) 3378 dev->flags &= ~IFF_ALLMULTI; 3379 else { 3380 dev->allmulti -= inc; 3381 printk(KERN_WARNING "%s: allmulti touches roof, " 3382 "set allmulti failed, allmulti feature of " 3383 "device might be broken.\n", dev->name); 3384 return -EOVERFLOW; 3385 } 3386 } 3387 if (dev->flags ^ old_flags) { 3388 dev_change_rx_flags(dev, IFF_ALLMULTI); 3389 dev_set_rx_mode(dev); 3390 } 3391 return 0; 3392 } 3393 3394 /* 3395 * Upload unicast and multicast address lists to device and 3396 * configure RX filtering. When the device doesn't support unicast 3397 * filtering it is put in promiscuous mode while unicast addresses 3398 * are present. 3399 */ 3400 void __dev_set_rx_mode(struct net_device *dev) 3401 { 3402 const struct net_device_ops *ops = dev->netdev_ops; 3403 3404 /* dev_open will call this function so the list will stay sane. */ 3405 if (!(dev->flags&IFF_UP)) 3406 return; 3407 3408 if (!netif_device_present(dev)) 3409 return; 3410 3411 if (ops->ndo_set_rx_mode) 3412 ops->ndo_set_rx_mode(dev); 3413 else { 3414 /* Unicast addresses changes may only happen under the rtnl, 3415 * therefore calling __dev_set_promiscuity here is safe. 3416 */ 3417 if (dev->uc_count > 0 && !dev->uc_promisc) { 3418 __dev_set_promiscuity(dev, 1); 3419 dev->uc_promisc = 1; 3420 } else if (dev->uc_count == 0 && dev->uc_promisc) { 3421 __dev_set_promiscuity(dev, -1); 3422 dev->uc_promisc = 0; 3423 } 3424 3425 if (ops->ndo_set_multicast_list) 3426 ops->ndo_set_multicast_list(dev); 3427 } 3428 } 3429 3430 void dev_set_rx_mode(struct net_device *dev) 3431 { 3432 netif_addr_lock_bh(dev); 3433 __dev_set_rx_mode(dev); 3434 netif_addr_unlock_bh(dev); 3435 } 3436 3437 int __dev_addr_delete(struct dev_addr_list **list, int *count, 3438 void *addr, int alen, int glbl) 3439 { 3440 struct dev_addr_list *da; 3441 3442 for (; (da = *list) != NULL; list = &da->next) { 3443 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && 3444 alen == da->da_addrlen) { 3445 if (glbl) { 3446 int old_glbl = da->da_gusers; 3447 da->da_gusers = 0; 3448 if (old_glbl == 0) 3449 break; 3450 } 3451 if (--da->da_users) 3452 return 0; 3453 3454 *list = da->next; 3455 kfree(da); 3456 (*count)--; 3457 return 0; 3458 } 3459 } 3460 return -ENOENT; 3461 } 3462 3463 int __dev_addr_add(struct dev_addr_list **list, int *count, 3464 void *addr, int alen, int glbl) 3465 { 3466 struct dev_addr_list *da; 3467 3468 for (da = *list; da != NULL; da = da->next) { 3469 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && 3470 da->da_addrlen == alen) { 3471 if (glbl) { 3472 int old_glbl = da->da_gusers; 3473 da->da_gusers = 1; 3474 if (old_glbl) 3475 return 0; 3476 } 3477 da->da_users++; 3478 return 0; 3479 } 3480 } 3481 3482 da = kzalloc(sizeof(*da), GFP_ATOMIC); 3483 if (da == NULL) 3484 return -ENOMEM; 3485 memcpy(da->da_addr, addr, alen); 3486 da->da_addrlen = alen; 3487 da->da_users = 1; 3488 da->da_gusers = glbl ? 1 : 0; 3489 da->next = *list; 3490 *list = da; 3491 (*count)++; 3492 return 0; 3493 } 3494 3495 /** 3496 * dev_unicast_delete - Release secondary unicast address. 3497 * @dev: device 3498 * @addr: address to delete 3499 * @alen: length of @addr 3500 * 3501 * Release reference to a secondary unicast address and remove it 3502 * from the device if the reference count drops to zero. 3503 * 3504 * The caller must hold the rtnl_mutex. 3505 */ 3506 int dev_unicast_delete(struct net_device *dev, void *addr, int alen) 3507 { 3508 int err; 3509 3510 ASSERT_RTNL(); 3511 3512 netif_addr_lock_bh(dev); 3513 err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); 3514 if (!err) 3515 __dev_set_rx_mode(dev); 3516 netif_addr_unlock_bh(dev); 3517 return err; 3518 } 3519 EXPORT_SYMBOL(dev_unicast_delete); 3520 3521 /** 3522 * dev_unicast_add - add a secondary unicast address 3523 * @dev: device 3524 * @addr: address to add 3525 * @alen: length of @addr 3526 * 3527 * Add a secondary unicast address to the device or increase 3528 * the reference count if it already exists. 3529 * 3530 * The caller must hold the rtnl_mutex. 3531 */ 3532 int dev_unicast_add(struct net_device *dev, void *addr, int alen) 3533 { 3534 int err; 3535 3536 ASSERT_RTNL(); 3537 3538 netif_addr_lock_bh(dev); 3539 err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); 3540 if (!err) 3541 __dev_set_rx_mode(dev); 3542 netif_addr_unlock_bh(dev); 3543 return err; 3544 } 3545 EXPORT_SYMBOL(dev_unicast_add); 3546 3547 int __dev_addr_sync(struct dev_addr_list **to, int *to_count, 3548 struct dev_addr_list **from, int *from_count) 3549 { 3550 struct dev_addr_list *da, *next; 3551 int err = 0; 3552 3553 da = *from; 3554 while (da != NULL) { 3555 next = da->next; 3556 if (!da->da_synced) { 3557 err = __dev_addr_add(to, to_count, 3558 da->da_addr, da->da_addrlen, 0); 3559 if (err < 0) 3560 break; 3561 da->da_synced = 1; 3562 da->da_users++; 3563 } else if (da->da_users == 1) { 3564 __dev_addr_delete(to, to_count, 3565 da->da_addr, da->da_addrlen, 0); 3566 __dev_addr_delete(from, from_count, 3567 da->da_addr, da->da_addrlen, 0); 3568 } 3569 da = next; 3570 } 3571 return err; 3572 } 3573 3574 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, 3575 struct dev_addr_list **from, int *from_count) 3576 { 3577 struct dev_addr_list *da, *next; 3578 3579 da = *from; 3580 while (da != NULL) { 3581 next = da->next; 3582 if (da->da_synced) { 3583 __dev_addr_delete(to, to_count, 3584 da->da_addr, da->da_addrlen, 0); 3585 da->da_synced = 0; 3586 __dev_addr_delete(from, from_count, 3587 da->da_addr, da->da_addrlen, 0); 3588 } 3589 da = next; 3590 } 3591 } 3592 3593 /** 3594 * dev_unicast_sync - Synchronize device's unicast list to another device 3595 * @to: destination device 3596 * @from: source device 3597 * 3598 * Add newly added addresses to the destination device and release 3599 * addresses that have no users left. The source device must be 3600 * locked by netif_tx_lock_bh. 3601 * 3602 * This function is intended to be called from the dev->set_rx_mode 3603 * function of layered software devices. 3604 */ 3605 int dev_unicast_sync(struct net_device *to, struct net_device *from) 3606 { 3607 int err = 0; 3608 3609 netif_addr_lock_bh(to); 3610 err = __dev_addr_sync(&to->uc_list, &to->uc_count, 3611 &from->uc_list, &from->uc_count); 3612 if (!err) 3613 __dev_set_rx_mode(to); 3614 netif_addr_unlock_bh(to); 3615 return err; 3616 } 3617 EXPORT_SYMBOL(dev_unicast_sync); 3618 3619 /** 3620 * dev_unicast_unsync - Remove synchronized addresses from the destination device 3621 * @to: destination device 3622 * @from: source device 3623 * 3624 * Remove all addresses that were added to the destination device by 3625 * dev_unicast_sync(). This function is intended to be called from the 3626 * dev->stop function of layered software devices. 3627 */ 3628 void dev_unicast_unsync(struct net_device *to, struct net_device *from) 3629 { 3630 netif_addr_lock_bh(from); 3631 netif_addr_lock(to); 3632 3633 __dev_addr_unsync(&to->uc_list, &to->uc_count, 3634 &from->uc_list, &from->uc_count); 3635 __dev_set_rx_mode(to); 3636 3637 netif_addr_unlock(to); 3638 netif_addr_unlock_bh(from); 3639 } 3640 EXPORT_SYMBOL(dev_unicast_unsync); 3641 3642 static void __dev_addr_discard(struct dev_addr_list **list) 3643 { 3644 struct dev_addr_list *tmp; 3645 3646 while (*list != NULL) { 3647 tmp = *list; 3648 *list = tmp->next; 3649 if (tmp->da_users > tmp->da_gusers) 3650 printk("__dev_addr_discard: address leakage! " 3651 "da_users=%d\n", tmp->da_users); 3652 kfree(tmp); 3653 } 3654 } 3655 3656 static void dev_addr_discard(struct net_device *dev) 3657 { 3658 netif_addr_lock_bh(dev); 3659 3660 __dev_addr_discard(&dev->uc_list); 3661 dev->uc_count = 0; 3662 3663 __dev_addr_discard(&dev->mc_list); 3664 dev->mc_count = 0; 3665 3666 netif_addr_unlock_bh(dev); 3667 } 3668 3669 /** 3670 * dev_get_flags - get flags reported to userspace 3671 * @dev: device 3672 * 3673 * Get the combination of flag bits exported through APIs to userspace. 3674 */ 3675 unsigned dev_get_flags(const struct net_device *dev) 3676 { 3677 unsigned flags; 3678 3679 flags = (dev->flags & ~(IFF_PROMISC | 3680 IFF_ALLMULTI | 3681 IFF_RUNNING | 3682 IFF_LOWER_UP | 3683 IFF_DORMANT)) | 3684 (dev->gflags & (IFF_PROMISC | 3685 IFF_ALLMULTI)); 3686 3687 if (netif_running(dev)) { 3688 if (netif_oper_up(dev)) 3689 flags |= IFF_RUNNING; 3690 if (netif_carrier_ok(dev)) 3691 flags |= IFF_LOWER_UP; 3692 if (netif_dormant(dev)) 3693 flags |= IFF_DORMANT; 3694 } 3695 3696 return flags; 3697 } 3698 3699 /** 3700 * dev_change_flags - change device settings 3701 * @dev: device 3702 * @flags: device state flags 3703 * 3704 * Change settings on device based state flags. The flags are 3705 * in the userspace exported format. 3706 */ 3707 int dev_change_flags(struct net_device *dev, unsigned flags) 3708 { 3709 int ret, changes; 3710 int old_flags = dev->flags; 3711 3712 ASSERT_RTNL(); 3713 3714 /* 3715 * Set the flags on our device. 3716 */ 3717 3718 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 3719 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 3720 IFF_AUTOMEDIA)) | 3721 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 3722 IFF_ALLMULTI)); 3723 3724 /* 3725 * Load in the correct multicast list now the flags have changed. 3726 */ 3727 3728 if ((old_flags ^ flags) & IFF_MULTICAST) 3729 dev_change_rx_flags(dev, IFF_MULTICAST); 3730 3731 dev_set_rx_mode(dev); 3732 3733 /* 3734 * Have we downed the interface. We handle IFF_UP ourselves 3735 * according to user attempts to set it, rather than blindly 3736 * setting it. 3737 */ 3738 3739 ret = 0; 3740 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ 3741 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); 3742 3743 if (!ret) 3744 dev_set_rx_mode(dev); 3745 } 3746 3747 if (dev->flags & IFF_UP && 3748 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | 3749 IFF_VOLATILE))) 3750 call_netdevice_notifiers(NETDEV_CHANGE, dev); 3751 3752 if ((flags ^ dev->gflags) & IFF_PROMISC) { 3753 int inc = (flags & IFF_PROMISC) ? +1 : -1; 3754 dev->gflags ^= IFF_PROMISC; 3755 dev_set_promiscuity(dev, inc); 3756 } 3757 3758 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 3759 is important. Some (broken) drivers set IFF_PROMISC, when 3760 IFF_ALLMULTI is requested not asking us and not reporting. 3761 */ 3762 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 3763 int inc = (flags & IFF_ALLMULTI) ? +1 : -1; 3764 dev->gflags ^= IFF_ALLMULTI; 3765 dev_set_allmulti(dev, inc); 3766 } 3767 3768 /* Exclude state transition flags, already notified */ 3769 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); 3770 if (changes) 3771 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 3772 3773 return ret; 3774 } 3775 3776 /** 3777 * dev_set_mtu - Change maximum transfer unit 3778 * @dev: device 3779 * @new_mtu: new transfer unit 3780 * 3781 * Change the maximum transfer size of the network device. 3782 */ 3783 int dev_set_mtu(struct net_device *dev, int new_mtu) 3784 { 3785 const struct net_device_ops *ops = dev->netdev_ops; 3786 int err; 3787 3788 if (new_mtu == dev->mtu) 3789 return 0; 3790 3791 /* MTU must be positive. */ 3792 if (new_mtu < 0) 3793 return -EINVAL; 3794 3795 if (!netif_device_present(dev)) 3796 return -ENODEV; 3797 3798 err = 0; 3799 if (ops->ndo_change_mtu) 3800 err = ops->ndo_change_mtu(dev, new_mtu); 3801 else 3802 dev->mtu = new_mtu; 3803 3804 if (!err && dev->flags & IFF_UP) 3805 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 3806 return err; 3807 } 3808 3809 /** 3810 * dev_set_mac_address - Change Media Access Control Address 3811 * @dev: device 3812 * @sa: new address 3813 * 3814 * Change the hardware (MAC) address of the device 3815 */ 3816 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 3817 { 3818 const struct net_device_ops *ops = dev->netdev_ops; 3819 int err; 3820 3821 if (!ops->ndo_set_mac_address) 3822 return -EOPNOTSUPP; 3823 if (sa->sa_family != dev->type) 3824 return -EINVAL; 3825 if (!netif_device_present(dev)) 3826 return -ENODEV; 3827 err = ops->ndo_set_mac_address(dev, sa); 3828 if (!err) 3829 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 3830 return err; 3831 } 3832 3833 /* 3834 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) 3835 */ 3836 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) 3837 { 3838 int err; 3839 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 3840 3841 if (!dev) 3842 return -ENODEV; 3843 3844 switch (cmd) { 3845 case SIOCGIFFLAGS: /* Get interface flags */ 3846 ifr->ifr_flags = dev_get_flags(dev); 3847 return 0; 3848 3849 case SIOCGIFMETRIC: /* Get the metric on the interface 3850 (currently unused) */ 3851 ifr->ifr_metric = 0; 3852 return 0; 3853 3854 case SIOCGIFMTU: /* Get the MTU of a device */ 3855 ifr->ifr_mtu = dev->mtu; 3856 return 0; 3857 3858 case SIOCGIFHWADDR: 3859 if (!dev->addr_len) 3860 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); 3861 else 3862 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 3863 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 3864 ifr->ifr_hwaddr.sa_family = dev->type; 3865 return 0; 3866 3867 case SIOCGIFSLAVE: 3868 err = -EINVAL; 3869 break; 3870 3871 case SIOCGIFMAP: 3872 ifr->ifr_map.mem_start = dev->mem_start; 3873 ifr->ifr_map.mem_end = dev->mem_end; 3874 ifr->ifr_map.base_addr = dev->base_addr; 3875 ifr->ifr_map.irq = dev->irq; 3876 ifr->ifr_map.dma = dev->dma; 3877 ifr->ifr_map.port = dev->if_port; 3878 return 0; 3879 3880 case SIOCGIFINDEX: 3881 ifr->ifr_ifindex = dev->ifindex; 3882 return 0; 3883 3884 case SIOCGIFTXQLEN: 3885 ifr->ifr_qlen = dev->tx_queue_len; 3886 return 0; 3887 3888 default: 3889 /* dev_ioctl() should ensure this case 3890 * is never reached 3891 */ 3892 WARN_ON(1); 3893 err = -EINVAL; 3894 break; 3895 3896 } 3897 return err; 3898 } 3899 3900 /* 3901 * Perform the SIOCxIFxxx calls, inside rtnl_lock() 3902 */ 3903 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 3904 { 3905 int err; 3906 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 3907 const struct net_device_ops *ops; 3908 3909 if (!dev) 3910 return -ENODEV; 3911 3912 ops = dev->netdev_ops; 3913 3914 switch (cmd) { 3915 case SIOCSIFFLAGS: /* Set interface flags */ 3916 return dev_change_flags(dev, ifr->ifr_flags); 3917 3918 case SIOCSIFMETRIC: /* Set the metric on the interface 3919 (currently unused) */ 3920 return -EOPNOTSUPP; 3921 3922 case SIOCSIFMTU: /* Set the MTU of a device */ 3923 return dev_set_mtu(dev, ifr->ifr_mtu); 3924 3925 case SIOCSIFHWADDR: 3926 return dev_set_mac_address(dev, &ifr->ifr_hwaddr); 3927 3928 case SIOCSIFHWBROADCAST: 3929 if (ifr->ifr_hwaddr.sa_family != dev->type) 3930 return -EINVAL; 3931 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 3932 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 3933 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 3934 return 0; 3935 3936 case SIOCSIFMAP: 3937 if (ops->ndo_set_config) { 3938 if (!netif_device_present(dev)) 3939 return -ENODEV; 3940 return ops->ndo_set_config(dev, &ifr->ifr_map); 3941 } 3942 return -EOPNOTSUPP; 3943 3944 case SIOCADDMULTI: 3945 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 3946 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 3947 return -EINVAL; 3948 if (!netif_device_present(dev)) 3949 return -ENODEV; 3950 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, 3951 dev->addr_len, 1); 3952 3953 case SIOCDELMULTI: 3954 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 3955 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 3956 return -EINVAL; 3957 if (!netif_device_present(dev)) 3958 return -ENODEV; 3959 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, 3960 dev->addr_len, 1); 3961 3962 case SIOCSIFTXQLEN: 3963 if (ifr->ifr_qlen < 0) 3964 return -EINVAL; 3965 dev->tx_queue_len = ifr->ifr_qlen; 3966 return 0; 3967 3968 case SIOCSIFNAME: 3969 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 3970 return dev_change_name(dev, ifr->ifr_newname); 3971 3972 /* 3973 * Unknown or private ioctl 3974 */ 3975 3976 default: 3977 if ((cmd >= SIOCDEVPRIVATE && 3978 cmd <= SIOCDEVPRIVATE + 15) || 3979 cmd == SIOCBONDENSLAVE || 3980 cmd == SIOCBONDRELEASE || 3981 cmd == SIOCBONDSETHWADDR || 3982 cmd == SIOCBONDSLAVEINFOQUERY || 3983 cmd == SIOCBONDINFOQUERY || 3984 cmd == SIOCBONDCHANGEACTIVE || 3985 cmd == SIOCGMIIPHY || 3986 cmd == SIOCGMIIREG || 3987 cmd == SIOCSMIIREG || 3988 cmd == SIOCBRADDIF || 3989 cmd == SIOCBRDELIF || 3990 cmd == SIOCSHWTSTAMP || 3991 cmd == SIOCWANDEV) { 3992 err = -EOPNOTSUPP; 3993 if (ops->ndo_do_ioctl) { 3994 if (netif_device_present(dev)) 3995 err = ops->ndo_do_ioctl(dev, ifr, cmd); 3996 else 3997 err = -ENODEV; 3998 } 3999 } else 4000 err = -EINVAL; 4001 4002 } 4003 return err; 4004 } 4005 4006 /* 4007 * This function handles all "interface"-type I/O control requests. The actual 4008 * 'doing' part of this is dev_ifsioc above. 4009 */ 4010 4011 /** 4012 * dev_ioctl - network device ioctl 4013 * @net: the applicable net namespace 4014 * @cmd: command to issue 4015 * @arg: pointer to a struct ifreq in user space 4016 * 4017 * Issue ioctl functions to devices. This is normally called by the 4018 * user space syscall interfaces but can sometimes be useful for 4019 * other purposes. The return value is the return from the syscall if 4020 * positive or a negative errno code on error. 4021 */ 4022 4023 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) 4024 { 4025 struct ifreq ifr; 4026 int ret; 4027 char *colon; 4028 4029 /* One special case: SIOCGIFCONF takes ifconf argument 4030 and requires shared lock, because it sleeps writing 4031 to user space. 4032 */ 4033 4034 if (cmd == SIOCGIFCONF) { 4035 rtnl_lock(); 4036 ret = dev_ifconf(net, (char __user *) arg); 4037 rtnl_unlock(); 4038 return ret; 4039 } 4040 if (cmd == SIOCGIFNAME) 4041 return dev_ifname(net, (struct ifreq __user *)arg); 4042 4043 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 4044 return -EFAULT; 4045 4046 ifr.ifr_name[IFNAMSIZ-1] = 0; 4047 4048 colon = strchr(ifr.ifr_name, ':'); 4049 if (colon) 4050 *colon = 0; 4051 4052 /* 4053 * See which interface the caller is talking about. 4054 */ 4055 4056 switch (cmd) { 4057 /* 4058 * These ioctl calls: 4059 * - can be done by all. 4060 * - atomic and do not require locking. 4061 * - return a value 4062 */ 4063 case SIOCGIFFLAGS: 4064 case SIOCGIFMETRIC: 4065 case SIOCGIFMTU: 4066 case SIOCGIFHWADDR: 4067 case SIOCGIFSLAVE: 4068 case SIOCGIFMAP: 4069 case SIOCGIFINDEX: 4070 case SIOCGIFTXQLEN: 4071 dev_load(net, ifr.ifr_name); 4072 read_lock(&dev_base_lock); 4073 ret = dev_ifsioc_locked(net, &ifr, cmd); 4074 read_unlock(&dev_base_lock); 4075 if (!ret) { 4076 if (colon) 4077 *colon = ':'; 4078 if (copy_to_user(arg, &ifr, 4079 sizeof(struct ifreq))) 4080 ret = -EFAULT; 4081 } 4082 return ret; 4083 4084 case SIOCETHTOOL: 4085 dev_load(net, ifr.ifr_name); 4086 rtnl_lock(); 4087 ret = dev_ethtool(net, &ifr); 4088 rtnl_unlock(); 4089 if (!ret) { 4090 if (colon) 4091 *colon = ':'; 4092 if (copy_to_user(arg, &ifr, 4093 sizeof(struct ifreq))) 4094 ret = -EFAULT; 4095 } 4096 return ret; 4097 4098 /* 4099 * These ioctl calls: 4100 * - require superuser power. 4101 * - require strict serialization. 4102 * - return a value 4103 */ 4104 case SIOCGMIIPHY: 4105 case SIOCGMIIREG: 4106 case SIOCSIFNAME: 4107 if (!capable(CAP_NET_ADMIN)) 4108 return -EPERM; 4109 dev_load(net, ifr.ifr_name); 4110 rtnl_lock(); 4111 ret = dev_ifsioc(net, &ifr, cmd); 4112 rtnl_unlock(); 4113 if (!ret) { 4114 if (colon) 4115 *colon = ':'; 4116 if (copy_to_user(arg, &ifr, 4117 sizeof(struct ifreq))) 4118 ret = -EFAULT; 4119 } 4120 return ret; 4121 4122 /* 4123 * These ioctl calls: 4124 * - require superuser power. 4125 * - require strict serialization. 4126 * - do not return a value 4127 */ 4128 case SIOCSIFFLAGS: 4129 case SIOCSIFMETRIC: 4130 case SIOCSIFMTU: 4131 case SIOCSIFMAP: 4132 case SIOCSIFHWADDR: 4133 case SIOCSIFSLAVE: 4134 case SIOCADDMULTI: 4135 case SIOCDELMULTI: 4136 case SIOCSIFHWBROADCAST: 4137 case SIOCSIFTXQLEN: 4138 case SIOCSMIIREG: 4139 case SIOCBONDENSLAVE: 4140 case SIOCBONDRELEASE: 4141 case SIOCBONDSETHWADDR: 4142 case SIOCBONDCHANGEACTIVE: 4143 case SIOCBRADDIF: 4144 case SIOCBRDELIF: 4145 case SIOCSHWTSTAMP: 4146 if (!capable(CAP_NET_ADMIN)) 4147 return -EPERM; 4148 /* fall through */ 4149 case SIOCBONDSLAVEINFOQUERY: 4150 case SIOCBONDINFOQUERY: 4151 dev_load(net, ifr.ifr_name); 4152 rtnl_lock(); 4153 ret = dev_ifsioc(net, &ifr, cmd); 4154 rtnl_unlock(); 4155 return ret; 4156 4157 case SIOCGIFMEM: 4158 /* Get the per device memory space. We can add this but 4159 * currently do not support it */ 4160 case SIOCSIFMEM: 4161 /* Set the per device memory buffer space. 4162 * Not applicable in our case */ 4163 case SIOCSIFLINK: 4164 return -EINVAL; 4165 4166 /* 4167 * Unknown or private ioctl. 4168 */ 4169 default: 4170 if (cmd == SIOCWANDEV || 4171 (cmd >= SIOCDEVPRIVATE && 4172 cmd <= SIOCDEVPRIVATE + 15)) { 4173 dev_load(net, ifr.ifr_name); 4174 rtnl_lock(); 4175 ret = dev_ifsioc(net, &ifr, cmd); 4176 rtnl_unlock(); 4177 if (!ret && copy_to_user(arg, &ifr, 4178 sizeof(struct ifreq))) 4179 ret = -EFAULT; 4180 return ret; 4181 } 4182 /* Take care of Wireless Extensions */ 4183 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 4184 return wext_handle_ioctl(net, &ifr, cmd, arg); 4185 return -EINVAL; 4186 } 4187 } 4188 4189 4190 /** 4191 * dev_new_index - allocate an ifindex 4192 * @net: the applicable net namespace 4193 * 4194 * Returns a suitable unique value for a new device interface 4195 * number. The caller must hold the rtnl semaphore or the 4196 * dev_base_lock to be sure it remains unique. 4197 */ 4198 static int dev_new_index(struct net *net) 4199 { 4200 static int ifindex; 4201 for (;;) { 4202 if (++ifindex <= 0) 4203 ifindex = 1; 4204 if (!__dev_get_by_index(net, ifindex)) 4205 return ifindex; 4206 } 4207 } 4208 4209 /* Delayed registration/unregisteration */ 4210 static LIST_HEAD(net_todo_list); 4211 4212 static void net_set_todo(struct net_device *dev) 4213 { 4214 list_add_tail(&dev->todo_list, &net_todo_list); 4215 } 4216 4217 static void rollback_registered(struct net_device *dev) 4218 { 4219 BUG_ON(dev_boot_phase); 4220 ASSERT_RTNL(); 4221 4222 /* Some devices call without registering for initialization unwind. */ 4223 if (dev->reg_state == NETREG_UNINITIALIZED) { 4224 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " 4225 "was registered\n", dev->name, dev); 4226 4227 WARN_ON(1); 4228 return; 4229 } 4230 4231 BUG_ON(dev->reg_state != NETREG_REGISTERED); 4232 4233 /* If device is running, close it first. */ 4234 dev_close(dev); 4235 4236 /* And unlink it from device chain. */ 4237 unlist_netdevice(dev); 4238 4239 dev->reg_state = NETREG_UNREGISTERING; 4240 4241 synchronize_net(); 4242 4243 /* Shutdown queueing discipline. */ 4244 dev_shutdown(dev); 4245 4246 4247 /* Notify protocols, that we are about to destroy 4248 this device. They should clean all the things. 4249 */ 4250 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4251 4252 /* 4253 * Flush the unicast and multicast chains 4254 */ 4255 dev_addr_discard(dev); 4256 4257 if (dev->netdev_ops->ndo_uninit) 4258 dev->netdev_ops->ndo_uninit(dev); 4259 4260 /* Notifier chain MUST detach us from master device. */ 4261 WARN_ON(dev->master); 4262 4263 /* Remove entries from kobject tree */ 4264 netdev_unregister_kobject(dev); 4265 4266 synchronize_net(); 4267 4268 dev_put(dev); 4269 } 4270 4271 static void __netdev_init_queue_locks_one(struct net_device *dev, 4272 struct netdev_queue *dev_queue, 4273 void *_unused) 4274 { 4275 spin_lock_init(&dev_queue->_xmit_lock); 4276 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); 4277 dev_queue->xmit_lock_owner = -1; 4278 } 4279 4280 static void netdev_init_queue_locks(struct net_device *dev) 4281 { 4282 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 4283 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 4284 } 4285 4286 unsigned long netdev_fix_features(unsigned long features, const char *name) 4287 { 4288 /* Fix illegal SG+CSUM combinations. */ 4289 if ((features & NETIF_F_SG) && 4290 !(features & NETIF_F_ALL_CSUM)) { 4291 if (name) 4292 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " 4293 "checksum feature.\n", name); 4294 features &= ~NETIF_F_SG; 4295 } 4296 4297 /* TSO requires that SG is present as well. */ 4298 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { 4299 if (name) 4300 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " 4301 "SG feature.\n", name); 4302 features &= ~NETIF_F_TSO; 4303 } 4304 4305 if (features & NETIF_F_UFO) { 4306 if (!(features & NETIF_F_GEN_CSUM)) { 4307 if (name) 4308 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4309 "since no NETIF_F_HW_CSUM feature.\n", 4310 name); 4311 features &= ~NETIF_F_UFO; 4312 } 4313 4314 if (!(features & NETIF_F_SG)) { 4315 if (name) 4316 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4317 "since no NETIF_F_SG feature.\n", name); 4318 features &= ~NETIF_F_UFO; 4319 } 4320 } 4321 4322 return features; 4323 } 4324 EXPORT_SYMBOL(netdev_fix_features); 4325 4326 /* Some devices need to (re-)set their netdev_ops inside 4327 * ->init() or similar. If that happens, we have to setup 4328 * the compat pointers again. 4329 */ 4330 void netdev_resync_ops(struct net_device *dev) 4331 { 4332 #ifdef CONFIG_COMPAT_NET_DEV_OPS 4333 const struct net_device_ops *ops = dev->netdev_ops; 4334 4335 dev->init = ops->ndo_init; 4336 dev->uninit = ops->ndo_uninit; 4337 dev->open = ops->ndo_open; 4338 dev->change_rx_flags = ops->ndo_change_rx_flags; 4339 dev->set_rx_mode = ops->ndo_set_rx_mode; 4340 dev->set_multicast_list = ops->ndo_set_multicast_list; 4341 dev->set_mac_address = ops->ndo_set_mac_address; 4342 dev->validate_addr = ops->ndo_validate_addr; 4343 dev->do_ioctl = ops->ndo_do_ioctl; 4344 dev->set_config = ops->ndo_set_config; 4345 dev->change_mtu = ops->ndo_change_mtu; 4346 dev->neigh_setup = ops->ndo_neigh_setup; 4347 dev->tx_timeout = ops->ndo_tx_timeout; 4348 dev->get_stats = ops->ndo_get_stats; 4349 dev->vlan_rx_register = ops->ndo_vlan_rx_register; 4350 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; 4351 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; 4352 #ifdef CONFIG_NET_POLL_CONTROLLER 4353 dev->poll_controller = ops->ndo_poll_controller; 4354 #endif 4355 #endif 4356 } 4357 EXPORT_SYMBOL(netdev_resync_ops); 4358 4359 /** 4360 * register_netdevice - register a network device 4361 * @dev: device to register 4362 * 4363 * Take a completed network device structure and add it to the kernel 4364 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4365 * chain. 0 is returned on success. A negative errno code is returned 4366 * on a failure to set up the device, or if the name is a duplicate. 4367 * 4368 * Callers must hold the rtnl semaphore. You may want 4369 * register_netdev() instead of this. 4370 * 4371 * BUGS: 4372 * The locking appears insufficient to guarantee two parallel registers 4373 * will not get the same name. 4374 */ 4375 4376 int register_netdevice(struct net_device *dev) 4377 { 4378 struct hlist_head *head; 4379 struct hlist_node *p; 4380 int ret; 4381 struct net *net = dev_net(dev); 4382 4383 BUG_ON(dev_boot_phase); 4384 ASSERT_RTNL(); 4385 4386 might_sleep(); 4387 4388 /* When net_device's are persistent, this will be fatal. */ 4389 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 4390 BUG_ON(!net); 4391 4392 spin_lock_init(&dev->addr_list_lock); 4393 netdev_set_addr_lockdep_class(dev); 4394 netdev_init_queue_locks(dev); 4395 4396 dev->iflink = -1; 4397 4398 #ifdef CONFIG_COMPAT_NET_DEV_OPS 4399 /* Netdevice_ops API compatiability support. 4400 * This is temporary until all network devices are converted. 4401 */ 4402 if (dev->netdev_ops) { 4403 netdev_resync_ops(dev); 4404 } else { 4405 char drivername[64]; 4406 pr_info("%s (%s): not using net_device_ops yet\n", 4407 dev->name, netdev_drivername(dev, drivername, 64)); 4408 4409 /* This works only because net_device_ops and the 4410 compatiablity structure are the same. */ 4411 dev->netdev_ops = (void *) &(dev->init); 4412 } 4413 #endif 4414 4415 /* Init, if this function is available */ 4416 if (dev->netdev_ops->ndo_init) { 4417 ret = dev->netdev_ops->ndo_init(dev); 4418 if (ret) { 4419 if (ret > 0) 4420 ret = -EIO; 4421 goto out; 4422 } 4423 } 4424 4425 if (!dev_valid_name(dev->name)) { 4426 ret = -EINVAL; 4427 goto err_uninit; 4428 } 4429 4430 dev->ifindex = dev_new_index(net); 4431 if (dev->iflink == -1) 4432 dev->iflink = dev->ifindex; 4433 4434 /* Check for existence of name */ 4435 head = dev_name_hash(net, dev->name); 4436 hlist_for_each(p, head) { 4437 struct net_device *d 4438 = hlist_entry(p, struct net_device, name_hlist); 4439 if (!strncmp(d->name, dev->name, IFNAMSIZ)) { 4440 ret = -EEXIST; 4441 goto err_uninit; 4442 } 4443 } 4444 4445 /* Fix illegal checksum combinations */ 4446 if ((dev->features & NETIF_F_HW_CSUM) && 4447 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 4448 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 4449 dev->name); 4450 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 4451 } 4452 4453 if ((dev->features & NETIF_F_NO_CSUM) && 4454 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 4455 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 4456 dev->name); 4457 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 4458 } 4459 4460 dev->features = netdev_fix_features(dev->features, dev->name); 4461 4462 /* Enable software GSO if SG is supported. */ 4463 if (dev->features & NETIF_F_SG) 4464 dev->features |= NETIF_F_GSO; 4465 4466 netdev_initialize_kobject(dev); 4467 ret = netdev_register_kobject(dev); 4468 if (ret) 4469 goto err_uninit; 4470 dev->reg_state = NETREG_REGISTERED; 4471 4472 /* 4473 * Default initial state at registry is that the 4474 * device is present. 4475 */ 4476 4477 set_bit(__LINK_STATE_PRESENT, &dev->state); 4478 4479 dev_init_scheduler(dev); 4480 dev_hold(dev); 4481 list_netdevice(dev); 4482 4483 /* Notify protocols, that a new device appeared. */ 4484 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 4485 ret = notifier_to_errno(ret); 4486 if (ret) { 4487 rollback_registered(dev); 4488 dev->reg_state = NETREG_UNREGISTERED; 4489 } 4490 4491 out: 4492 return ret; 4493 4494 err_uninit: 4495 if (dev->netdev_ops->ndo_uninit) 4496 dev->netdev_ops->ndo_uninit(dev); 4497 goto out; 4498 } 4499 4500 /** 4501 * init_dummy_netdev - init a dummy network device for NAPI 4502 * @dev: device to init 4503 * 4504 * This takes a network device structure and initialize the minimum 4505 * amount of fields so it can be used to schedule NAPI polls without 4506 * registering a full blown interface. This is to be used by drivers 4507 * that need to tie several hardware interfaces to a single NAPI 4508 * poll scheduler due to HW limitations. 4509 */ 4510 int init_dummy_netdev(struct net_device *dev) 4511 { 4512 /* Clear everything. Note we don't initialize spinlocks 4513 * are they aren't supposed to be taken by any of the 4514 * NAPI code and this dummy netdev is supposed to be 4515 * only ever used for NAPI polls 4516 */ 4517 memset(dev, 0, sizeof(struct net_device)); 4518 4519 /* make sure we BUG if trying to hit standard 4520 * register/unregister code path 4521 */ 4522 dev->reg_state = NETREG_DUMMY; 4523 4524 /* initialize the ref count */ 4525 atomic_set(&dev->refcnt, 1); 4526 4527 /* NAPI wants this */ 4528 INIT_LIST_HEAD(&dev->napi_list); 4529 4530 /* a dummy interface is started by default */ 4531 set_bit(__LINK_STATE_PRESENT, &dev->state); 4532 set_bit(__LINK_STATE_START, &dev->state); 4533 4534 return 0; 4535 } 4536 EXPORT_SYMBOL_GPL(init_dummy_netdev); 4537 4538 4539 /** 4540 * register_netdev - register a network device 4541 * @dev: device to register 4542 * 4543 * Take a completed network device structure and add it to the kernel 4544 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4545 * chain. 0 is returned on success. A negative errno code is returned 4546 * on a failure to set up the device, or if the name is a duplicate. 4547 * 4548 * This is a wrapper around register_netdevice that takes the rtnl semaphore 4549 * and expands the device name if you passed a format string to 4550 * alloc_netdev. 4551 */ 4552 int register_netdev(struct net_device *dev) 4553 { 4554 int err; 4555 4556 rtnl_lock(); 4557 4558 /* 4559 * If the name is a format string the caller wants us to do a 4560 * name allocation. 4561 */ 4562 if (strchr(dev->name, '%')) { 4563 err = dev_alloc_name(dev, dev->name); 4564 if (err < 0) 4565 goto out; 4566 } 4567 4568 err = register_netdevice(dev); 4569 out: 4570 rtnl_unlock(); 4571 return err; 4572 } 4573 EXPORT_SYMBOL(register_netdev); 4574 4575 /* 4576 * netdev_wait_allrefs - wait until all references are gone. 4577 * 4578 * This is called when unregistering network devices. 4579 * 4580 * Any protocol or device that holds a reference should register 4581 * for netdevice notification, and cleanup and put back the 4582 * reference if they receive an UNREGISTER event. 4583 * We can get stuck here if buggy protocols don't correctly 4584 * call dev_put. 4585 */ 4586 static void netdev_wait_allrefs(struct net_device *dev) 4587 { 4588 unsigned long rebroadcast_time, warning_time; 4589 4590 rebroadcast_time = warning_time = jiffies; 4591 while (atomic_read(&dev->refcnt) != 0) { 4592 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 4593 rtnl_lock(); 4594 4595 /* Rebroadcast unregister notification */ 4596 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4597 4598 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 4599 &dev->state)) { 4600 /* We must not have linkwatch events 4601 * pending on unregister. If this 4602 * happens, we simply run the queue 4603 * unscheduled, resulting in a noop 4604 * for this device. 4605 */ 4606 linkwatch_run_queue(); 4607 } 4608 4609 __rtnl_unlock(); 4610 4611 rebroadcast_time = jiffies; 4612 } 4613 4614 msleep(250); 4615 4616 if (time_after(jiffies, warning_time + 10 * HZ)) { 4617 printk(KERN_EMERG "unregister_netdevice: " 4618 "waiting for %s to become free. Usage " 4619 "count = %d\n", 4620 dev->name, atomic_read(&dev->refcnt)); 4621 warning_time = jiffies; 4622 } 4623 } 4624 } 4625 4626 /* The sequence is: 4627 * 4628 * rtnl_lock(); 4629 * ... 4630 * register_netdevice(x1); 4631 * register_netdevice(x2); 4632 * ... 4633 * unregister_netdevice(y1); 4634 * unregister_netdevice(y2); 4635 * ... 4636 * rtnl_unlock(); 4637 * free_netdev(y1); 4638 * free_netdev(y2); 4639 * 4640 * We are invoked by rtnl_unlock(). 4641 * This allows us to deal with problems: 4642 * 1) We can delete sysfs objects which invoke hotplug 4643 * without deadlocking with linkwatch via keventd. 4644 * 2) Since we run with the RTNL semaphore not held, we can sleep 4645 * safely in order to wait for the netdev refcnt to drop to zero. 4646 * 4647 * We must not return until all unregister events added during 4648 * the interval the lock was held have been completed. 4649 */ 4650 void netdev_run_todo(void) 4651 { 4652 struct list_head list; 4653 4654 /* Snapshot list, allow later requests */ 4655 list_replace_init(&net_todo_list, &list); 4656 4657 __rtnl_unlock(); 4658 4659 while (!list_empty(&list)) { 4660 struct net_device *dev 4661 = list_entry(list.next, struct net_device, todo_list); 4662 list_del(&dev->todo_list); 4663 4664 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 4665 printk(KERN_ERR "network todo '%s' but state %d\n", 4666 dev->name, dev->reg_state); 4667 dump_stack(); 4668 continue; 4669 } 4670 4671 dev->reg_state = NETREG_UNREGISTERED; 4672 4673 on_each_cpu(flush_backlog, dev, 1); 4674 4675 netdev_wait_allrefs(dev); 4676 4677 /* paranoia */ 4678 BUG_ON(atomic_read(&dev->refcnt)); 4679 WARN_ON(dev->ip_ptr); 4680 WARN_ON(dev->ip6_ptr); 4681 WARN_ON(dev->dn_ptr); 4682 4683 if (dev->destructor) 4684 dev->destructor(dev); 4685 4686 /* Free network device */ 4687 kobject_put(&dev->dev.kobj); 4688 } 4689 } 4690 4691 /** 4692 * dev_get_stats - get network device statistics 4693 * @dev: device to get statistics from 4694 * 4695 * Get network statistics from device. The device driver may provide 4696 * its own method by setting dev->netdev_ops->get_stats; otherwise 4697 * the internal statistics structure is used. 4698 */ 4699 const struct net_device_stats *dev_get_stats(struct net_device *dev) 4700 { 4701 const struct net_device_ops *ops = dev->netdev_ops; 4702 4703 if (ops->ndo_get_stats) 4704 return ops->ndo_get_stats(dev); 4705 else 4706 return &dev->stats; 4707 } 4708 EXPORT_SYMBOL(dev_get_stats); 4709 4710 static void netdev_init_one_queue(struct net_device *dev, 4711 struct netdev_queue *queue, 4712 void *_unused) 4713 { 4714 queue->dev = dev; 4715 } 4716 4717 static void netdev_init_queues(struct net_device *dev) 4718 { 4719 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 4720 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 4721 spin_lock_init(&dev->tx_global_lock); 4722 } 4723 4724 /** 4725 * alloc_netdev_mq - allocate network device 4726 * @sizeof_priv: size of private data to allocate space for 4727 * @name: device name format string 4728 * @setup: callback to initialize device 4729 * @queue_count: the number of subqueues to allocate 4730 * 4731 * Allocates a struct net_device with private data area for driver use 4732 * and performs basic initialization. Also allocates subquue structs 4733 * for each queue on the device at the end of the netdevice. 4734 */ 4735 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 4736 void (*setup)(struct net_device *), unsigned int queue_count) 4737 { 4738 struct netdev_queue *tx; 4739 struct net_device *dev; 4740 size_t alloc_size; 4741 void *p; 4742 4743 BUG_ON(strlen(name) >= sizeof(dev->name)); 4744 4745 alloc_size = sizeof(struct net_device); 4746 if (sizeof_priv) { 4747 /* ensure 32-byte alignment of private area */ 4748 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; 4749 alloc_size += sizeof_priv; 4750 } 4751 /* ensure 32-byte alignment of whole construct */ 4752 alloc_size += NETDEV_ALIGN_CONST; 4753 4754 p = kzalloc(alloc_size, GFP_KERNEL); 4755 if (!p) { 4756 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); 4757 return NULL; 4758 } 4759 4760 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); 4761 if (!tx) { 4762 printk(KERN_ERR "alloc_netdev: Unable to allocate " 4763 "tx qdiscs.\n"); 4764 kfree(p); 4765 return NULL; 4766 } 4767 4768 dev = (struct net_device *) 4769 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); 4770 dev->padded = (char *)dev - (char *)p; 4771 dev_net_set(dev, &init_net); 4772 4773 dev->_tx = tx; 4774 dev->num_tx_queues = queue_count; 4775 dev->real_num_tx_queues = queue_count; 4776 4777 dev->gso_max_size = GSO_MAX_SIZE; 4778 4779 netdev_init_queues(dev); 4780 4781 INIT_LIST_HEAD(&dev->napi_list); 4782 setup(dev); 4783 strcpy(dev->name, name); 4784 return dev; 4785 } 4786 EXPORT_SYMBOL(alloc_netdev_mq); 4787 4788 /** 4789 * free_netdev - free network device 4790 * @dev: device 4791 * 4792 * This function does the last stage of destroying an allocated device 4793 * interface. The reference to the device object is released. 4794 * If this is the last reference then it will be freed. 4795 */ 4796 void free_netdev(struct net_device *dev) 4797 { 4798 struct napi_struct *p, *n; 4799 4800 release_net(dev_net(dev)); 4801 4802 kfree(dev->_tx); 4803 4804 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 4805 netif_napi_del(p); 4806 4807 /* Compatibility with error handling in drivers */ 4808 if (dev->reg_state == NETREG_UNINITIALIZED) { 4809 kfree((char *)dev - dev->padded); 4810 return; 4811 } 4812 4813 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 4814 dev->reg_state = NETREG_RELEASED; 4815 4816 /* will free via device release */ 4817 put_device(&dev->dev); 4818 } 4819 4820 /** 4821 * synchronize_net - Synchronize with packet receive processing 4822 * 4823 * Wait for packets currently being received to be done. 4824 * Does not block later packets from starting. 4825 */ 4826 void synchronize_net(void) 4827 { 4828 might_sleep(); 4829 synchronize_rcu(); 4830 } 4831 4832 /** 4833 * unregister_netdevice - remove device from the kernel 4834 * @dev: device 4835 * 4836 * This function shuts down a device interface and removes it 4837 * from the kernel tables. 4838 * 4839 * Callers must hold the rtnl semaphore. You may want 4840 * unregister_netdev() instead of this. 4841 */ 4842 4843 void unregister_netdevice(struct net_device *dev) 4844 { 4845 ASSERT_RTNL(); 4846 4847 rollback_registered(dev); 4848 /* Finish processing unregister after unlock */ 4849 net_set_todo(dev); 4850 } 4851 4852 /** 4853 * unregister_netdev - remove device from the kernel 4854 * @dev: device 4855 * 4856 * This function shuts down a device interface and removes it 4857 * from the kernel tables. 4858 * 4859 * This is just a wrapper for unregister_netdevice that takes 4860 * the rtnl semaphore. In general you want to use this and not 4861 * unregister_netdevice. 4862 */ 4863 void unregister_netdev(struct net_device *dev) 4864 { 4865 rtnl_lock(); 4866 unregister_netdevice(dev); 4867 rtnl_unlock(); 4868 } 4869 4870 EXPORT_SYMBOL(unregister_netdev); 4871 4872 /** 4873 * dev_change_net_namespace - move device to different nethost namespace 4874 * @dev: device 4875 * @net: network namespace 4876 * @pat: If not NULL name pattern to try if the current device name 4877 * is already taken in the destination network namespace. 4878 * 4879 * This function shuts down a device interface and moves it 4880 * to a new network namespace. On success 0 is returned, on 4881 * a failure a netagive errno code is returned. 4882 * 4883 * Callers must hold the rtnl semaphore. 4884 */ 4885 4886 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 4887 { 4888 char buf[IFNAMSIZ]; 4889 const char *destname; 4890 int err; 4891 4892 ASSERT_RTNL(); 4893 4894 /* Don't allow namespace local devices to be moved. */ 4895 err = -EINVAL; 4896 if (dev->features & NETIF_F_NETNS_LOCAL) 4897 goto out; 4898 4899 #ifdef CONFIG_SYSFS 4900 /* Don't allow real devices to be moved when sysfs 4901 * is enabled. 4902 */ 4903 err = -EINVAL; 4904 if (dev->dev.parent) 4905 goto out; 4906 #endif 4907 4908 /* Ensure the device has been registrered */ 4909 err = -EINVAL; 4910 if (dev->reg_state != NETREG_REGISTERED) 4911 goto out; 4912 4913 /* Get out if there is nothing todo */ 4914 err = 0; 4915 if (net_eq(dev_net(dev), net)) 4916 goto out; 4917 4918 /* Pick the destination device name, and ensure 4919 * we can use it in the destination network namespace. 4920 */ 4921 err = -EEXIST; 4922 destname = dev->name; 4923 if (__dev_get_by_name(net, destname)) { 4924 /* We get here if we can't use the current device name */ 4925 if (!pat) 4926 goto out; 4927 if (!dev_valid_name(pat)) 4928 goto out; 4929 if (strchr(pat, '%')) { 4930 if (__dev_alloc_name(net, pat, buf) < 0) 4931 goto out; 4932 destname = buf; 4933 } else 4934 destname = pat; 4935 if (__dev_get_by_name(net, destname)) 4936 goto out; 4937 } 4938 4939 /* 4940 * And now a mini version of register_netdevice unregister_netdevice. 4941 */ 4942 4943 /* If device is running close it first. */ 4944 dev_close(dev); 4945 4946 /* And unlink it from device chain */ 4947 err = -ENODEV; 4948 unlist_netdevice(dev); 4949 4950 synchronize_net(); 4951 4952 /* Shutdown queueing discipline. */ 4953 dev_shutdown(dev); 4954 4955 /* Notify protocols, that we are about to destroy 4956 this device. They should clean all the things. 4957 */ 4958 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4959 4960 /* 4961 * Flush the unicast and multicast chains 4962 */ 4963 dev_addr_discard(dev); 4964 4965 netdev_unregister_kobject(dev); 4966 4967 /* Actually switch the network namespace */ 4968 dev_net_set(dev, net); 4969 4970 /* Assign the new device name */ 4971 if (destname != dev->name) 4972 strcpy(dev->name, destname); 4973 4974 /* If there is an ifindex conflict assign a new one */ 4975 if (__dev_get_by_index(net, dev->ifindex)) { 4976 int iflink = (dev->iflink == dev->ifindex); 4977 dev->ifindex = dev_new_index(net); 4978 if (iflink) 4979 dev->iflink = dev->ifindex; 4980 } 4981 4982 /* Fixup kobjects */ 4983 err = netdev_register_kobject(dev); 4984 WARN_ON(err); 4985 4986 /* Add the device back in the hashes */ 4987 list_netdevice(dev); 4988 4989 /* Notify protocols, that a new device appeared. */ 4990 call_netdevice_notifiers(NETDEV_REGISTER, dev); 4991 4992 synchronize_net(); 4993 err = 0; 4994 out: 4995 return err; 4996 } 4997 4998 static int dev_cpu_callback(struct notifier_block *nfb, 4999 unsigned long action, 5000 void *ocpu) 5001 { 5002 struct sk_buff **list_skb; 5003 struct Qdisc **list_net; 5004 struct sk_buff *skb; 5005 unsigned int cpu, oldcpu = (unsigned long)ocpu; 5006 struct softnet_data *sd, *oldsd; 5007 5008 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 5009 return NOTIFY_OK; 5010 5011 local_irq_disable(); 5012 cpu = smp_processor_id(); 5013 sd = &per_cpu(softnet_data, cpu); 5014 oldsd = &per_cpu(softnet_data, oldcpu); 5015 5016 /* Find end of our completion_queue. */ 5017 list_skb = &sd->completion_queue; 5018 while (*list_skb) 5019 list_skb = &(*list_skb)->next; 5020 /* Append completion queue from offline CPU. */ 5021 *list_skb = oldsd->completion_queue; 5022 oldsd->completion_queue = NULL; 5023 5024 /* Find end of our output_queue. */ 5025 list_net = &sd->output_queue; 5026 while (*list_net) 5027 list_net = &(*list_net)->next_sched; 5028 /* Append output queue from offline CPU. */ 5029 *list_net = oldsd->output_queue; 5030 oldsd->output_queue = NULL; 5031 5032 raise_softirq_irqoff(NET_TX_SOFTIRQ); 5033 local_irq_enable(); 5034 5035 /* Process offline CPU's input_pkt_queue */ 5036 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) 5037 netif_rx(skb); 5038 5039 return NOTIFY_OK; 5040 } 5041 5042 5043 /** 5044 * netdev_increment_features - increment feature set by one 5045 * @all: current feature set 5046 * @one: new feature set 5047 * @mask: mask feature set 5048 * 5049 * Computes a new feature set after adding a device with feature set 5050 * @one to the master device with current feature set @all. Will not 5051 * enable anything that is off in @mask. Returns the new feature set. 5052 */ 5053 unsigned long netdev_increment_features(unsigned long all, unsigned long one, 5054 unsigned long mask) 5055 { 5056 /* If device needs checksumming, downgrade to it. */ 5057 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 5058 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); 5059 else if (mask & NETIF_F_ALL_CSUM) { 5060 /* If one device supports v4/v6 checksumming, set for all. */ 5061 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && 5062 !(all & NETIF_F_GEN_CSUM)) { 5063 all &= ~NETIF_F_ALL_CSUM; 5064 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); 5065 } 5066 5067 /* If one device supports hw checksumming, set for all. */ 5068 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { 5069 all &= ~NETIF_F_ALL_CSUM; 5070 all |= NETIF_F_HW_CSUM; 5071 } 5072 } 5073 5074 one |= NETIF_F_ALL_CSUM; 5075 5076 one |= all & NETIF_F_ONE_FOR_ALL; 5077 all &= one | NETIF_F_LLTX | NETIF_F_GSO; 5078 all |= one & mask & NETIF_F_ONE_FOR_ALL; 5079 5080 return all; 5081 } 5082 EXPORT_SYMBOL(netdev_increment_features); 5083 5084 static struct hlist_head *netdev_create_hash(void) 5085 { 5086 int i; 5087 struct hlist_head *hash; 5088 5089 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 5090 if (hash != NULL) 5091 for (i = 0; i < NETDEV_HASHENTRIES; i++) 5092 INIT_HLIST_HEAD(&hash[i]); 5093 5094 return hash; 5095 } 5096 5097 /* Initialize per network namespace state */ 5098 static int __net_init netdev_init(struct net *net) 5099 { 5100 INIT_LIST_HEAD(&net->dev_base_head); 5101 5102 net->dev_name_head = netdev_create_hash(); 5103 if (net->dev_name_head == NULL) 5104 goto err_name; 5105 5106 net->dev_index_head = netdev_create_hash(); 5107 if (net->dev_index_head == NULL) 5108 goto err_idx; 5109 5110 return 0; 5111 5112 err_idx: 5113 kfree(net->dev_name_head); 5114 err_name: 5115 return -ENOMEM; 5116 } 5117 5118 /** 5119 * netdev_drivername - network driver for the device 5120 * @dev: network device 5121 * @buffer: buffer for resulting name 5122 * @len: size of buffer 5123 * 5124 * Determine network driver for device. 5125 */ 5126 char *netdev_drivername(const struct net_device *dev, char *buffer, int len) 5127 { 5128 const struct device_driver *driver; 5129 const struct device *parent; 5130 5131 if (len <= 0 || !buffer) 5132 return buffer; 5133 buffer[0] = 0; 5134 5135 parent = dev->dev.parent; 5136 5137 if (!parent) 5138 return buffer; 5139 5140 driver = parent->driver; 5141 if (driver && driver->name) 5142 strlcpy(buffer, driver->name, len); 5143 return buffer; 5144 } 5145 5146 static void __net_exit netdev_exit(struct net *net) 5147 { 5148 kfree(net->dev_name_head); 5149 kfree(net->dev_index_head); 5150 } 5151 5152 static struct pernet_operations __net_initdata netdev_net_ops = { 5153 .init = netdev_init, 5154 .exit = netdev_exit, 5155 }; 5156 5157 static void __net_exit default_device_exit(struct net *net) 5158 { 5159 struct net_device *dev; 5160 /* 5161 * Push all migratable of the network devices back to the 5162 * initial network namespace 5163 */ 5164 rtnl_lock(); 5165 restart: 5166 for_each_netdev(net, dev) { 5167 int err; 5168 char fb_name[IFNAMSIZ]; 5169 5170 /* Ignore unmoveable devices (i.e. loopback) */ 5171 if (dev->features & NETIF_F_NETNS_LOCAL) 5172 continue; 5173 5174 /* Delete virtual devices */ 5175 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { 5176 dev->rtnl_link_ops->dellink(dev); 5177 goto restart; 5178 } 5179 5180 /* Push remaing network devices to init_net */ 5181 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 5182 err = dev_change_net_namespace(dev, &init_net, fb_name); 5183 if (err) { 5184 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", 5185 __func__, dev->name, err); 5186 BUG(); 5187 } 5188 goto restart; 5189 } 5190 rtnl_unlock(); 5191 } 5192 5193 static struct pernet_operations __net_initdata default_device_ops = { 5194 .exit = default_device_exit, 5195 }; 5196 5197 /* 5198 * Initialize the DEV module. At boot time this walks the device list and 5199 * unhooks any devices that fail to initialise (normally hardware not 5200 * present) and leaves us with a valid list of present and active devices. 5201 * 5202 */ 5203 5204 /* 5205 * This is called single threaded during boot, so no need 5206 * to take the rtnl semaphore. 5207 */ 5208 static int __init net_dev_init(void) 5209 { 5210 int i, rc = -ENOMEM; 5211 5212 BUG_ON(!dev_boot_phase); 5213 5214 if (dev_proc_init()) 5215 goto out; 5216 5217 if (netdev_kobject_init()) 5218 goto out; 5219 5220 INIT_LIST_HEAD(&ptype_all); 5221 for (i = 0; i < PTYPE_HASH_SIZE; i++) 5222 INIT_LIST_HEAD(&ptype_base[i]); 5223 5224 if (register_pernet_subsys(&netdev_net_ops)) 5225 goto out; 5226 5227 /* 5228 * Initialise the packet receive queues. 5229 */ 5230 5231 for_each_possible_cpu(i) { 5232 struct softnet_data *queue; 5233 5234 queue = &per_cpu(softnet_data, i); 5235 skb_queue_head_init(&queue->input_pkt_queue); 5236 queue->completion_queue = NULL; 5237 INIT_LIST_HEAD(&queue->poll_list); 5238 5239 queue->backlog.poll = process_backlog; 5240 queue->backlog.weight = weight_p; 5241 queue->backlog.gro_list = NULL; 5242 queue->backlog.gro_count = 0; 5243 } 5244 5245 dev_boot_phase = 0; 5246 5247 /* The loopback device is special if any other network devices 5248 * is present in a network namespace the loopback device must 5249 * be present. Since we now dynamically allocate and free the 5250 * loopback device ensure this invariant is maintained by 5251 * keeping the loopback device as the first device on the 5252 * list of network devices. Ensuring the loopback devices 5253 * is the first device that appears and the last network device 5254 * that disappears. 5255 */ 5256 if (register_pernet_device(&loopback_net_ops)) 5257 goto out; 5258 5259 if (register_pernet_device(&default_device_ops)) 5260 goto out; 5261 5262 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 5263 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 5264 5265 hotcpu_notifier(dev_cpu_callback, 0); 5266 dst_init(); 5267 dev_mcast_init(); 5268 rc = 0; 5269 out: 5270 return rc; 5271 } 5272 5273 subsys_initcall(net_dev_init); 5274 5275 static int __init initialize_hashrnd(void) 5276 { 5277 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); 5278 return 0; 5279 } 5280 5281 late_initcall_sync(initialize_hashrnd); 5282 5283 EXPORT_SYMBOL(__dev_get_by_index); 5284 EXPORT_SYMBOL(__dev_get_by_name); 5285 EXPORT_SYMBOL(__dev_remove_pack); 5286 EXPORT_SYMBOL(dev_valid_name); 5287 EXPORT_SYMBOL(dev_add_pack); 5288 EXPORT_SYMBOL(dev_alloc_name); 5289 EXPORT_SYMBOL(dev_close); 5290 EXPORT_SYMBOL(dev_get_by_flags); 5291 EXPORT_SYMBOL(dev_get_by_index); 5292 EXPORT_SYMBOL(dev_get_by_name); 5293 EXPORT_SYMBOL(dev_open); 5294 EXPORT_SYMBOL(dev_queue_xmit); 5295 EXPORT_SYMBOL(dev_remove_pack); 5296 EXPORT_SYMBOL(dev_set_allmulti); 5297 EXPORT_SYMBOL(dev_set_promiscuity); 5298 EXPORT_SYMBOL(dev_change_flags); 5299 EXPORT_SYMBOL(dev_set_mtu); 5300 EXPORT_SYMBOL(dev_set_mac_address); 5301 EXPORT_SYMBOL(free_netdev); 5302 EXPORT_SYMBOL(netdev_boot_setup_check); 5303 EXPORT_SYMBOL(netdev_set_master); 5304 EXPORT_SYMBOL(netdev_state_change); 5305 EXPORT_SYMBOL(netif_receive_skb); 5306 EXPORT_SYMBOL(netif_rx); 5307 EXPORT_SYMBOL(register_gifconf); 5308 EXPORT_SYMBOL(register_netdevice); 5309 EXPORT_SYMBOL(register_netdevice_notifier); 5310 EXPORT_SYMBOL(skb_checksum_help); 5311 EXPORT_SYMBOL(synchronize_net); 5312 EXPORT_SYMBOL(unregister_netdevice); 5313 EXPORT_SYMBOL(unregister_netdevice_notifier); 5314 EXPORT_SYMBOL(net_enable_timestamp); 5315 EXPORT_SYMBOL(net_disable_timestamp); 5316 EXPORT_SYMBOL(dev_get_flags); 5317 5318 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) 5319 EXPORT_SYMBOL(br_handle_frame_hook); 5320 EXPORT_SYMBOL(br_fdb_get_hook); 5321 EXPORT_SYMBOL(br_fdb_put_hook); 5322 #endif 5323 5324 EXPORT_SYMBOL(dev_load); 5325 5326 EXPORT_PER_CPU_SYMBOL(softnet_data); 5327