1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 
49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <asm/system.h> 77 #include <linux/bitops.h> 78 #include <linux/capability.h> 79 #include <linux/cpu.h> 80 #include <linux/types.h> 81 #include <linux/kernel.h> 82 #include <linux/sched.h> 83 #include <linux/mutex.h> 84 #include <linux/string.h> 85 #include <linux/mm.h> 86 #include <linux/socket.h> 87 #include <linux/sockios.h> 88 #include <linux/errno.h> 89 #include <linux/interrupt.h> 90 #include <linux/if_ether.h> 91 #include <linux/netdevice.h> 92 #include <linux/etherdevice.h> 93 #include <linux/ethtool.h> 94 #include <linux/notifier.h> 95 #include <linux/skbuff.h> 96 #include <net/net_namespace.h> 97 #include <net/sock.h> 98 #include <linux/rtnetlink.h> 99 #include <linux/proc_fs.h> 100 #include <linux/seq_file.h> 101 #include <linux/stat.h> 102 #include <linux/if_bridge.h> 103 #include <linux/if_macvlan.h> 104 #include <net/dst.h> 105 #include <net/pkt_sched.h> 106 #include <net/checksum.h> 107 #include <linux/highmem.h> 108 
#include <linux/init.h> 109 #include <linux/kmod.h> 110 #include <linux/module.h> 111 #include <linux/kallsyms.h> 112 #include <linux/netpoll.h> 113 #include <linux/rcupdate.h> 114 #include <linux/delay.h> 115 #include <net/wext.h> 116 #include <net/iw_handler.h> 117 #include <asm/current.h> 118 #include <linux/audit.h> 119 #include <linux/dmaengine.h> 120 #include <linux/err.h> 121 #include <linux/ctype.h> 122 #include <linux/if_arp.h> 123 #include <linux/if_vlan.h> 124 #include <linux/ip.h> 125 #include <linux/ipv6.h> 126 #include <linux/in.h> 127 #include <linux/jhash.h> 128 #include <linux/random.h> 129 130 #include "net-sysfs.h" 131 132 /* 133 * The list of packet types we will receive (as opposed to discard) 134 * and the routines to invoke. 135 * 136 * Why 16. Because with 16 the only overlap we get on a hash of the 137 * low nibble of the protocol value is RARP/SNAP/X.25. 138 * 139 * NOTE: That is no longer true with the addition of VLAN tags. Not 140 * sure which should go first, but I bet it won't make much 141 * difference if we are running VLANs. The good news is that 142 * this protocol won't be in the list unless compiled in, so 143 * the average user (w/out VLANs) will not be adversely affected. 
144 * --BLG 145 * 146 * 0800 IP 147 * 8100 802.1Q VLAN 148 * 0001 802.3 149 * 0002 AX.25 150 * 0004 802.2 151 * 8035 RARP 152 * 0005 SNAP 153 * 0805 X.25 154 * 0806 ARP 155 * 8137 IPX 156 * 0009 Localtalk 157 * 86DD IPv6 158 */ 159 160 #define PTYPE_HASH_SIZE (16) 161 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) 162 163 static DEFINE_SPINLOCK(ptype_lock); 164 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 165 static struct list_head ptype_all __read_mostly; /* Taps */ 166 167 #ifdef CONFIG_NET_DMA 168 struct net_dma { 169 struct dma_client client; 170 spinlock_t lock; 171 cpumask_t channel_mask; 172 struct dma_chan **channels; 173 }; 174 175 static enum dma_state_client 176 netdev_dma_event(struct dma_client *client, struct dma_chan *chan, 177 enum dma_state state); 178 179 static struct net_dma net_dma = { 180 .client = { 181 .event_callback = netdev_dma_event, 182 }, 183 }; 184 #endif 185 186 /* 187 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 188 * semaphore. 189 * 190 * Pure readers hold dev_base_lock for reading. 191 * 192 * Writers must hold the rtnl semaphore while they loop through the 193 * dev_base_head list, and hold dev_base_lock for writing when they do the 194 * actual updates. This allows pure readers to access the list even 195 * while a writer is preparing to update it. 196 * 197 * To put it another way, dev_base_lock is held for writing only to 198 * protect against pure readers; the rtnl semaphore provides the 199 * protection against other writers. 200 * 201 * See, for example usages, register_netdevice() and 202 * unregister_netdevice(), which must be called with the rtnl 203 * semaphore held. 
204 */ 205 DEFINE_RWLOCK(dev_base_lock); 206 207 EXPORT_SYMBOL(dev_base_lock); 208 209 #define NETDEV_HASHBITS 8 210 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) 211 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 213 { 214 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); 215 return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; 216 } 217 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 219 { 220 return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; 221 } 222 223 /* Device list insertion */ 224 static int list_netdevice(struct net_device *dev) 225 { 226 struct net *net = dev_net(dev); 227 228 ASSERT_RTNL(); 229 230 write_lock_bh(&dev_base_lock); 231 list_add_tail(&dev->dev_list, &net->dev_base_head); 232 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); 233 hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); 234 write_unlock_bh(&dev_base_lock); 235 return 0; 236 } 237 238 /* Device list removal */ 239 static void unlist_netdevice(struct net_device *dev) 240 { 241 ASSERT_RTNL(); 242 243 /* Unlink dev from the device chain */ 244 write_lock_bh(&dev_base_lock); 245 list_del(&dev->dev_list); 246 hlist_del(&dev->name_hlist); 247 hlist_del(&dev->index_hlist); 248 write_unlock_bh(&dev_base_lock); 249 } 250 251 /* 252 * Our notifier list 253 */ 254 255 static RAW_NOTIFIER_HEAD(netdev_chain); 256 257 /* 258 * Device drivers call our routines to queue packets here. We empty the 259 * queue in the local softnet handler. 
260 */ 261 262 DEFINE_PER_CPU(struct softnet_data, softnet_data); 263 264 #ifdef CONFIG_DEBUG_LOCK_ALLOC 265 /* 266 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 267 * according to dev->type 268 */ 269 static const unsigned short netdev_lock_type[] = 270 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 271 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 272 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 273 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 274 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 275 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 276 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 277 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 278 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 279 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 280 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 281 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 282 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, 283 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID, 284 ARPHRD_NONE}; 285 286 static const char *netdev_lock_name[] = 287 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 288 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 289 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 290 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 291 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 292 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 293 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 294 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 295 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 296 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 297 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 298 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 299 "_xmit_FCFABRIC", 
"_xmit_IEEE802_TR", "_xmit_IEEE80211", 300 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID", 301 "_xmit_NONE"}; 302 303 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 304 305 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 306 { 307 int i; 308 309 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 310 if (netdev_lock_type[i] == dev_type) 311 return i; 312 /* the last key is used by default */ 313 return ARRAY_SIZE(netdev_lock_type) - 1; 314 } 315 316 static inline void netdev_set_lockdep_class(spinlock_t *lock, 317 unsigned short dev_type) 318 { 319 int i; 320 321 i = netdev_lock_pos(dev_type); 322 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 323 netdev_lock_name[i]); 324 } 325 #else 326 static inline void netdev_set_lockdep_class(spinlock_t *lock, 327 unsigned short dev_type) 328 { 329 } 330 #endif 331 332 /******************************************************************************* 333 334 Protocol management and registration routines 335 336 *******************************************************************************/ 337 338 /* 339 * Add a protocol ID to the list. Now that the input handler is 340 * smarter we can dispense with all the messy stuff that used to be 341 * here. 342 * 343 * BEWARE!!! Protocol handlers, mangling input packets, 344 * MUST BE last in hash buckets and checking protocol handlers 345 * MUST start from promiscuous ptype_all chain in net_bh. 346 * It is true now, do not change it. 347 * Explanation follows: if protocol handler, mangling packet, will 348 * be the first on list, it is not able to sense, that packet 349 * is cloned and should be copied-on-write, so that it will 350 * change it and subsequent readers will get broken packet. 351 * --ANK (980803) 352 */ 353 354 /** 355 * dev_add_pack - add packet handler 356 * @pt: packet type declaration 357 * 358 * Add a protocol handler to the networking stack. 
The passed &packet_type 359 * is linked into kernel lists and may not be freed until it has been 360 * removed from the kernel lists. 361 * 362 * This call does not sleep therefore it can not 363 * guarantee all CPU's that are in middle of receiving packets 364 * will see the new packet type (until the next received packet). 365 */ 366 367 void dev_add_pack(struct packet_type *pt) 368 { 369 int hash; 370 371 spin_lock_bh(&ptype_lock); 372 if (pt->type == htons(ETH_P_ALL)) 373 list_add_rcu(&pt->list, &ptype_all); 374 else { 375 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 376 list_add_rcu(&pt->list, &ptype_base[hash]); 377 } 378 spin_unlock_bh(&ptype_lock); 379 } 380 381 /** 382 * __dev_remove_pack - remove packet handler 383 * @pt: packet type declaration 384 * 385 * Remove a protocol handler that was previously added to the kernel 386 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 387 * from the kernel lists and can be freed or reused once this function 388 * returns. 389 * 390 * The packet type might still be in use by receivers 391 * and must not be freed until after all the CPU's have gone 392 * through a quiescent state. 393 */ 394 void __dev_remove_pack(struct packet_type *pt) 395 { 396 struct list_head *head; 397 struct packet_type *pt1; 398 399 spin_lock_bh(&ptype_lock); 400 401 if (pt->type == htons(ETH_P_ALL)) 402 head = &ptype_all; 403 else 404 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 405 406 list_for_each_entry(pt1, head, list) { 407 if (pt == pt1) { 408 list_del_rcu(&pt->list); 409 goto out; 410 } 411 } 412 413 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); 414 out: 415 spin_unlock_bh(&ptype_lock); 416 } 417 /** 418 * dev_remove_pack - remove packet handler 419 * @pt: packet type declaration 420 * 421 * Remove a protocol handler that was previously added to the kernel 422 * protocol handlers by dev_add_pack(). 
The passed &packet_type is removed 423 * from the kernel lists and can be freed or reused once this function 424 * returns. 425 * 426 * This call sleeps to guarantee that no CPU is looking at the packet 427 * type after return. 428 */ 429 void dev_remove_pack(struct packet_type *pt) 430 { 431 __dev_remove_pack(pt); 432 433 synchronize_net(); 434 } 435 436 /****************************************************************************** 437 438 Device Boot-time Settings Routines 439 440 *******************************************************************************/ 441 442 /* Boot time configuration table */ 443 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 444 445 /** 446 * netdev_boot_setup_add - add new setup entry 447 * @name: name of the device 448 * @map: configured settings for the device 449 * 450 * Adds new setup entry to the dev_boot_setup list. The function 451 * returns 0 on error and 1 on success. This is a generic routine to 452 * all netdevices. 453 */ 454 static int netdev_boot_setup_add(char *name, struct ifmap *map) 455 { 456 struct netdev_boot_setup *s; 457 int i; 458 459 s = dev_boot_setup; 460 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 461 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 462 memset(s[i].name, 0, sizeof(s[i].name)); 463 strlcpy(s[i].name, name, IFNAMSIZ); 464 memcpy(&s[i].map, map, sizeof(s[i].map)); 465 break; 466 } 467 } 468 469 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 470 } 471 472 /** 473 * netdev_boot_setup_check - check boot time settings 474 * @dev: the netdevice 475 * 476 * Check boot time settings for the device. 477 * The found settings are set for the device to be used 478 * later in the device probing. 479 * Returns 0 if no settings found, 1 if they are. 
480 */ 481 int netdev_boot_setup_check(struct net_device *dev) 482 { 483 struct netdev_boot_setup *s = dev_boot_setup; 484 int i; 485 486 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 487 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 488 !strcmp(dev->name, s[i].name)) { 489 dev->irq = s[i].map.irq; 490 dev->base_addr = s[i].map.base_addr; 491 dev->mem_start = s[i].map.mem_start; 492 dev->mem_end = s[i].map.mem_end; 493 return 1; 494 } 495 } 496 return 0; 497 } 498 499 500 /** 501 * netdev_boot_base - get address from boot time settings 502 * @prefix: prefix for network device 503 * @unit: id for network device 504 * 505 * Check boot time settings for the base address of device. 506 * The found settings are set for the device to be used 507 * later in the device probing. 508 * Returns 0 if no settings found. 509 */ 510 unsigned long netdev_boot_base(const char *prefix, int unit) 511 { 512 const struct netdev_boot_setup *s = dev_boot_setup; 513 char name[IFNAMSIZ]; 514 int i; 515 516 sprintf(name, "%s%d", prefix, unit); 517 518 /* 519 * If device already registered then return base of 1 520 * to indicate not to probe for this interface 521 */ 522 if (__dev_get_by_name(&init_net, name)) 523 return 1; 524 525 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 526 if (!strcmp(name, s[i].name)) 527 return s[i].map.base_addr; 528 return 0; 529 } 530 531 /* 532 * Saves at boot time configured settings for any netdevice. 
533 */ 534 int __init netdev_boot_setup(char *str) 535 { 536 int ints[5]; 537 struct ifmap map; 538 539 str = get_options(str, ARRAY_SIZE(ints), ints); 540 if (!str || !*str) 541 return 0; 542 543 /* Save settings */ 544 memset(&map, 0, sizeof(map)); 545 if (ints[0] > 0) 546 map.irq = ints[1]; 547 if (ints[0] > 1) 548 map.base_addr = ints[2]; 549 if (ints[0] > 2) 550 map.mem_start = ints[3]; 551 if (ints[0] > 3) 552 map.mem_end = ints[4]; 553 554 /* Add new entry to the list */ 555 return netdev_boot_setup_add(str, &map); 556 } 557 558 __setup("netdev=", netdev_boot_setup); 559 560 /******************************************************************************* 561 562 Device Interface Subroutines 563 564 *******************************************************************************/ 565 566 /** 567 * __dev_get_by_name - find a device by its name 568 * @net: the applicable net namespace 569 * @name: name to find 570 * 571 * Find an interface by name. Must be called under RTNL semaphore 572 * or @dev_base_lock. If the name is found a pointer to the device 573 * is returned. If the name is not found then %NULL is returned. The 574 * reference counters are not incremented so the caller must be 575 * careful with locks. 576 */ 577 578 struct net_device *__dev_get_by_name(struct net *net, const char *name) 579 { 580 struct hlist_node *p; 581 582 hlist_for_each(p, dev_name_hash(net, name)) { 583 struct net_device *dev 584 = hlist_entry(p, struct net_device, name_hlist); 585 if (!strncmp(dev->name, name, IFNAMSIZ)) 586 return dev; 587 } 588 return NULL; 589 } 590 591 /** 592 * dev_get_by_name - find a device by its name 593 * @net: the applicable net namespace 594 * @name: name to find 595 * 596 * Find an interface by name. This can be called from any 597 * context and does its own locking. The returned handle has 598 * the usage count incremented and the caller must use dev_put() to 599 * release it when it is no longer needed. 
%NULL is returned if no 600 * matching device is found. 601 */ 602 603 struct net_device *dev_get_by_name(struct net *net, const char *name) 604 { 605 struct net_device *dev; 606 607 read_lock(&dev_base_lock); 608 dev = __dev_get_by_name(net, name); 609 if (dev) 610 dev_hold(dev); 611 read_unlock(&dev_base_lock); 612 return dev; 613 } 614 615 /** 616 * __dev_get_by_index - find a device by its ifindex 617 * @net: the applicable net namespace 618 * @ifindex: index of device 619 * 620 * Search for an interface by index. Returns %NULL if the device 621 * is not found or a pointer to the device. The device has not 622 * had its reference counter increased so the caller must be careful 623 * about locking. The caller must hold either the RTNL semaphore 624 * or @dev_base_lock. 625 */ 626 627 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 628 { 629 struct hlist_node *p; 630 631 hlist_for_each(p, dev_index_hash(net, ifindex)) { 632 struct net_device *dev 633 = hlist_entry(p, struct net_device, index_hlist); 634 if (dev->ifindex == ifindex) 635 return dev; 636 } 637 return NULL; 638 } 639 640 641 /** 642 * dev_get_by_index - find a device by its ifindex 643 * @net: the applicable net namespace 644 * @ifindex: index of device 645 * 646 * Search for an interface by index. Returns NULL if the device 647 * is not found or a pointer to the device. The device returned has 648 * had a reference added and the pointer is safe until the user calls 649 * dev_put to indicate they have finished with it. 
650 */ 651 652 struct net_device *dev_get_by_index(struct net *net, int ifindex) 653 { 654 struct net_device *dev; 655 656 read_lock(&dev_base_lock); 657 dev = __dev_get_by_index(net, ifindex); 658 if (dev) 659 dev_hold(dev); 660 read_unlock(&dev_base_lock); 661 return dev; 662 } 663 664 /** 665 * dev_getbyhwaddr - find a device by its hardware address 666 * @net: the applicable net namespace 667 * @type: media type of device 668 * @ha: hardware address 669 * 670 * Search for an interface by MAC address. Returns NULL if the device 671 * is not found or a pointer to the device. The caller must hold the 672 * rtnl semaphore. The returned device has not had its ref count increased 673 * and the caller must therefore be careful about locking 674 * 675 * BUGS: 676 * If the API was consistent this would be __dev_get_by_hwaddr 677 */ 678 679 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) 680 { 681 struct net_device *dev; 682 683 ASSERT_RTNL(); 684 685 for_each_netdev(net, dev) 686 if (dev->type == type && 687 !memcmp(dev->dev_addr, ha, dev->addr_len)) 688 return dev; 689 690 return NULL; 691 } 692 693 EXPORT_SYMBOL(dev_getbyhwaddr); 694 695 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 696 { 697 struct net_device *dev; 698 699 ASSERT_RTNL(); 700 for_each_netdev(net, dev) 701 if (dev->type == type) 702 return dev; 703 704 return NULL; 705 } 706 707 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 708 709 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 710 { 711 struct net_device *dev; 712 713 rtnl_lock(); 714 dev = __dev_getfirstbyhwtype(net, type); 715 if (dev) 716 dev_hold(dev); 717 rtnl_unlock(); 718 return dev; 719 } 720 721 EXPORT_SYMBOL(dev_getfirstbyhwtype); 722 723 /** 724 * dev_get_by_flags - find any device with given flags 725 * @net: the applicable net namespace 726 * @if_flags: IFF_* values 727 * @mask: bitmask of bits in if_flags to check 728 * 729 * Search for any 
interface with the given flags. Returns NULL if a device 730 * is not found or a pointer to the device. The device returned has 731 * had a reference added and the pointer is safe until the user calls 732 * dev_put to indicate they have finished with it. 733 */ 734 735 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) 736 { 737 struct net_device *dev, *ret; 738 739 ret = NULL; 740 read_lock(&dev_base_lock); 741 for_each_netdev(net, dev) { 742 if (((dev->flags ^ if_flags) & mask) == 0) { 743 dev_hold(dev); 744 ret = dev; 745 break; 746 } 747 } 748 read_unlock(&dev_base_lock); 749 return ret; 750 } 751 752 /** 753 * dev_valid_name - check if name is okay for network device 754 * @name: name string 755 * 756 * Network device names need to be valid file names to 757 * to allow sysfs to work. We also disallow any kind of 758 * whitespace. 759 */ 760 int dev_valid_name(const char *name) 761 { 762 if (*name == '\0') 763 return 0; 764 if (strlen(name) >= IFNAMSIZ) 765 return 0; 766 if (!strcmp(name, ".") || !strcmp(name, "..")) 767 return 0; 768 769 while (*name) { 770 if (*name == '/' || isspace(*name)) 771 return 0; 772 name++; 773 } 774 return 1; 775 } 776 777 /** 778 * __dev_alloc_name - allocate a name for a device 779 * @net: network namespace to allocate the device name in 780 * @name: name format string 781 * @buf: scratch buffer and result name string 782 * 783 * Passed a format string - eg "lt%d" it will try and find a suitable 784 * id. It scans list of devices to build up a free map, then chooses 785 * the first empty slot. The caller must hold the dev_base or rtnl lock 786 * while allocating the name and adding the device in order to avoid 787 * duplicates. 788 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 789 * Returns the number of the unit assigned or a negative errno code. 
790 */ 791 792 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 793 { 794 int i = 0; 795 const char *p; 796 const int max_netdevices = 8*PAGE_SIZE; 797 unsigned long *inuse; 798 struct net_device *d; 799 800 p = strnchr(name, IFNAMSIZ-1, '%'); 801 if (p) { 802 /* 803 * Verify the string as this thing may have come from 804 * the user. There must be either one "%d" and no other "%" 805 * characters. 806 */ 807 if (p[1] != 'd' || strchr(p + 2, '%')) 808 return -EINVAL; 809 810 /* Use one page as a bit array of possible slots */ 811 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 812 if (!inuse) 813 return -ENOMEM; 814 815 for_each_netdev(net, d) { 816 if (!sscanf(d->name, name, &i)) 817 continue; 818 if (i < 0 || i >= max_netdevices) 819 continue; 820 821 /* avoid cases where sscanf is not exact inverse of printf */ 822 snprintf(buf, IFNAMSIZ, name, i); 823 if (!strncmp(buf, d->name, IFNAMSIZ)) 824 set_bit(i, inuse); 825 } 826 827 i = find_first_zero_bit(inuse, max_netdevices); 828 free_page((unsigned long) inuse); 829 } 830 831 snprintf(buf, IFNAMSIZ, name, i); 832 if (!__dev_get_by_name(net, buf)) 833 return i; 834 835 /* It is possible to run out of possible slots 836 * when the name is long and there isn't enough space left 837 * for the digits, or if all bits are used. 838 */ 839 return -ENFILE; 840 } 841 842 /** 843 * dev_alloc_name - allocate a name for a device 844 * @dev: device 845 * @name: name format string 846 * 847 * Passed a format string - eg "lt%d" it will try and find a suitable 848 * id. It scans list of devices to build up a free map, then chooses 849 * the first empty slot. The caller must hold the dev_base or rtnl lock 850 * while allocating the name and adding the device in order to avoid 851 * duplicates. 852 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 853 * Returns the number of the unit assigned or a negative errno code. 
854 */ 855 856 int dev_alloc_name(struct net_device *dev, const char *name) 857 { 858 char buf[IFNAMSIZ]; 859 struct net *net; 860 int ret; 861 862 BUG_ON(!dev_net(dev)); 863 net = dev_net(dev); 864 ret = __dev_alloc_name(net, name, buf); 865 if (ret >= 0) 866 strlcpy(dev->name, buf, IFNAMSIZ); 867 return ret; 868 } 869 870 871 /** 872 * dev_change_name - change name of a device 873 * @dev: device 874 * @newname: name (or format string) must be at least IFNAMSIZ 875 * 876 * Change name of a device, can pass format strings "eth%d". 877 * for wildcarding. 878 */ 879 int dev_change_name(struct net_device *dev, char *newname) 880 { 881 char oldname[IFNAMSIZ]; 882 int err = 0; 883 int ret; 884 struct net *net; 885 886 ASSERT_RTNL(); 887 BUG_ON(!dev_net(dev)); 888 889 net = dev_net(dev); 890 if (dev->flags & IFF_UP) 891 return -EBUSY; 892 893 if (!dev_valid_name(newname)) 894 return -EINVAL; 895 896 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) 897 return 0; 898 899 memcpy(oldname, dev->name, IFNAMSIZ); 900 901 if (strchr(newname, '%')) { 902 err = dev_alloc_name(dev, newname); 903 if (err < 0) 904 return err; 905 strcpy(newname, dev->name); 906 } 907 else if (__dev_get_by_name(net, newname)) 908 return -EEXIST; 909 else 910 strlcpy(dev->name, newname, IFNAMSIZ); 911 912 rollback: 913 err = device_rename(&dev->dev, dev->name); 914 if (err) { 915 memcpy(dev->name, oldname, IFNAMSIZ); 916 return err; 917 } 918 919 write_lock_bh(&dev_base_lock); 920 hlist_del(&dev->name_hlist); 921 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); 922 write_unlock_bh(&dev_base_lock); 923 924 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 925 ret = notifier_to_errno(ret); 926 927 if (ret) { 928 if (err) { 929 printk(KERN_ERR 930 "%s: name change rollback failed: %d.\n", 931 dev->name, ret); 932 } else { 933 err = ret; 934 memcpy(dev->name, oldname, IFNAMSIZ); 935 goto rollback; 936 } 937 } 938 939 return err; 940 } 941 942 /** 943 * netdev_features_change - device 
changes features 944 * @dev: device to cause notification 945 * 946 * Called to indicate a device has changed features. 947 */ 948 void netdev_features_change(struct net_device *dev) 949 { 950 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 951 } 952 EXPORT_SYMBOL(netdev_features_change); 953 954 /** 955 * netdev_state_change - device changes state 956 * @dev: device to cause notification 957 * 958 * Called to indicate a device has changed state. This function calls 959 * the notifier chains for netdev_chain and sends a NEWLINK message 960 * to the routing socket. 961 */ 962 void netdev_state_change(struct net_device *dev) 963 { 964 if (dev->flags & IFF_UP) { 965 call_netdevice_notifiers(NETDEV_CHANGE, dev); 966 rtmsg_ifinfo(RTM_NEWLINK, dev, 0); 967 } 968 } 969 970 void netdev_bonding_change(struct net_device *dev) 971 { 972 call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); 973 } 974 EXPORT_SYMBOL(netdev_bonding_change); 975 976 /** 977 * dev_load - load a network module 978 * @net: the applicable net namespace 979 * @name: name of interface 980 * 981 * If a network interface is not present and the process has suitable 982 * privileges this function loads the module. If module loading is not 983 * available in this kernel then it becomes a nop. 984 */ 985 986 void dev_load(struct net *net, const char *name) 987 { 988 struct net_device *dev; 989 990 read_lock(&dev_base_lock); 991 dev = __dev_get_by_name(net, name); 992 read_unlock(&dev_base_lock); 993 994 if (!dev && capable(CAP_SYS_MODULE)) 995 request_module("%s", name); 996 } 997 998 /** 999 * dev_open - prepare an interface for use. 1000 * @dev: device to open 1001 * 1002 * Takes a device from down to up state. The device's private open 1003 * function is invoked and then the multicast lists are loaded. Finally 1004 * the device is moved into the up state and a %NETDEV_UP message is 1005 * sent to the netdev notifier chain. 1006 * 1007 * Calling this function on an active interface is a nop. 
 * On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret = 0;

	ASSERT_RTNL();

	/*
	 *	Is it already up?
	 */

	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	/*
	 *	Call device private open method. __LINK_STATE_START is set
	 *	before the callbacks run and is cleared again below if
	 *	either of them fails.
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (dev->validate_addr)
		ret = dev->validate_addr(dev);

	/* ->open() is skipped when address validation already failed. */
	if (!ret && dev->open)
		ret = dev->open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);

		/*
		 *	... and announce new interface.
		 */
		call_netdevice_notifiers(NETDEV_UP, dev);
	}

	return ret;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 *
 *	Calling this function on a device that is not up is a nop.
 *	The function always returns 0.
 */
int dev_close(struct net_device *dev)
{
	ASSERT_RTNL();

	might_sleep();

	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for it while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (dev->stop)
		dev->stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 *
 *	The flag is only cleared when the driver provides both the
 *	get_flags and set_flags ethtool operations; a warning is raised
 *	if %NETIF_F_LRO is still set in dev->features afterwards.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);


/* Non-zero during boot; register_netdevice_notifier() skips its replay while set. */
static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
1170 * 1171 * When registered all registration and up events are replayed 1172 * to the new notifier to allow device to have a race free 1173 * view of the network device list. 1174 */ 1175 1176 int register_netdevice_notifier(struct notifier_block *nb) 1177 { 1178 struct net_device *dev; 1179 struct net_device *last; 1180 struct net *net; 1181 int err; 1182 1183 rtnl_lock(); 1184 err = raw_notifier_chain_register(&netdev_chain, nb); 1185 if (err) 1186 goto unlock; 1187 if (dev_boot_phase) 1188 goto unlock; 1189 for_each_net(net) { 1190 for_each_netdev(net, dev) { 1191 err = nb->notifier_call(nb, NETDEV_REGISTER, dev); 1192 err = notifier_to_errno(err); 1193 if (err) 1194 goto rollback; 1195 1196 if (!(dev->flags & IFF_UP)) 1197 continue; 1198 1199 nb->notifier_call(nb, NETDEV_UP, dev); 1200 } 1201 } 1202 1203 unlock: 1204 rtnl_unlock(); 1205 return err; 1206 1207 rollback: 1208 last = dev; 1209 for_each_net(net) { 1210 for_each_netdev(net, dev) { 1211 if (dev == last) 1212 break; 1213 1214 if (dev->flags & IFF_UP) { 1215 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); 1216 nb->notifier_call(nb, NETDEV_DOWN, dev); 1217 } 1218 nb->notifier_call(nb, NETDEV_UNREGISTER, dev); 1219 } 1220 } 1221 1222 raw_notifier_chain_unregister(&netdev_chain, nb); 1223 goto unlock; 1224 } 1225 1226 /** 1227 * unregister_netdevice_notifier - unregister a network notifier block 1228 * @nb: notifier 1229 * 1230 * Unregister a notifier previously registered by 1231 * register_netdevice_notifier(). The notifier is unlinked into the 1232 * kernel structures and may then be reused. A negative errno code 1233 * is returned on a failure. 
1234 */ 1235 1236 int unregister_netdevice_notifier(struct notifier_block *nb) 1237 { 1238 int err; 1239 1240 rtnl_lock(); 1241 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1242 rtnl_unlock(); 1243 return err; 1244 } 1245 1246 /** 1247 * call_netdevice_notifiers - call all network notifier blocks 1248 * @val: value passed unmodified to notifier function 1249 * @dev: net_device pointer passed unmodified to notifier function 1250 * 1251 * Call all network notifier blocks. Parameters and return value 1252 * are as for raw_notifier_call_chain(). 1253 */ 1254 1255 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1256 { 1257 return raw_notifier_call_chain(&netdev_chain, val, dev); 1258 } 1259 1260 /* When > 0 there are consumers of rx skb time stamps */ 1261 static atomic_t netstamp_needed = ATOMIC_INIT(0); 1262 1263 void net_enable_timestamp(void) 1264 { 1265 atomic_inc(&netstamp_needed); 1266 } 1267 1268 void net_disable_timestamp(void) 1269 { 1270 atomic_dec(&netstamp_needed); 1271 } 1272 1273 static inline void net_timestamp(struct sk_buff *skb) 1274 { 1275 if (atomic_read(&netstamp_needed)) 1276 __net_timestamp(skb); 1277 else 1278 skb->tstamp.tv64 = 0; 1279 } 1280 1281 /* 1282 * Support routine. Sends outgoing frames to any network 1283 * taps currently in use. 
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

	net_timestamp(skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			/* Each tap gets its own clone; stop at the first
			 * allocation failure.
			 */
			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       skb2->protocol, dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}


/*
 * Put qdisc @q on this CPU's softnet output_queue and raise
 * NET_TX_SOFTIRQ.  Nothing is queued for noop_qdisc (a one-time warning
 * is issued instead) or when __QDISC_STATE_SCHED was already set.
 */
void __netif_schedule(struct Qdisc *q)
{
	if (WARN_ON_ONCE(q == &noop_qdisc))
		return;

	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
		struct softnet_data *sd;
		unsigned long flags;

		/* The per-cpu list is manipulated with local IRQs off. */
		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		q->next_sched = sd->output_queue;
		sd->output_queue = q;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(__netif_schedule);

/*
 * Drop one reference on @skb.  When that was the last one the skb is
 * not freed here: it is chained onto this CPU's completion_queue and
 * NET_TX_SOFTIRQ is raised.
 */
void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

/*
 * Free @skb via dev_kfree_skb_irq() when called in hard-IRQ context or
 * with interrupts disabled, and via dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 * The queue is stopped only if the device was previously marked present
 * and is running.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_stop_queue(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart if needed.  The
 * queue is woken and the watchdog started only if the device was not
 * already marked present and is running.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_wake_queue(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * True when @features allow checksum offload for @protocol: either the
 * generic checksum feature is present, or the IPv4/IPv6 specific feature
 * is present and @protocol is the matching ethertype.
 */
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)));
}

/*
 * True when @dev can checksum @skb.  For 802.1Q frames the encapsulated
 * protocol is checked against the features that are set in both
 * dev->features and dev->vlan_features.
 */
static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success or the error from pskb_expand_head().  On
 * success skb->ip_summed is left as CHECKSUM_NONE; on the error path
 * it is not modified.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* Checksum everything from csum_start to the end of the packet. */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* The folded result is stored csum_offset bytes past csum_start
	 * and must lie inside the linear area.
	 */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private copy of the header if the clone is not writable
	 * up to and including the checksum field.
	 */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}

/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	/* Returned unchanged if no matching protocol handler is found. */
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	BUG_ON(skb_shinfo(skb)->frag_list);

	/* Strip the MAC header; it is pushed back before returning. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		/*
		 * NOTE(review): this early return skips the __skb_push()
		 * at the end of the function, so the MAC header stays
		 * pulled on this error path -- confirm callers only free
		 * the skb afterwards.
		 */
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		/* Only the first device-independent handler with a
		 * gso_segment callback for this protocol is used.
		 */
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
/* Rate-limited: logs the device name (or "<unknown>" for a NULL @dev)
 * and dumps the stack.
 */
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise.  Always 0 without
 * CONFIG_HIGHMEM.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
			return 1;

#endif
	return 0;
}

/* Saved in skb->cb while dev_gso_skb_destructor is installed: the
 * destructor the skb had before segmentation.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

/*
 * Destructor installed by dev_gso_segment(): frees every segment still
 * chained on skb->next, then calls the saved original destructor if
 * there is one.  The loop dereferences skb->next on entry, so the skb
 * must have at least one segment chained when this runs.
 */
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 *
 *	Returns 0 on success (including the case where no segmentation
 *	was needed) or the negative error carried by skb_gso_segment().
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	/* Mask out scatter/gather when a fragment sits in memory the
	 * device cannot DMA from.
	 */
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	/* Chain the segments and make sure they are freed with the skb. */
	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}

/*
 * Hand @skb to the driver's hard_start_xmit.
 *
 * An skb without chained segments is first shown to the taps (if any)
 * and, when the device needs software GSO, segmented.  Segments chained
 * on skb->next are then sent one at a time.  If the driver returns
 * non-zero for a segment it is put back at the head of the chain and
 * that value is returned; if the queue is stopped while segments
 * remain, NETDEV_TX_BUSY is returned.  Once all segments are sent (or
 * segmentation failed) the original skb is freed and 0 is returned.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		/* Unlink the first segment and transmit it on its own. */
		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Re-queue the segment at the head of the chain. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* Chain is empty: restore the original destructor before freeing. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}

/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
1664 * 1665 * Regardless of the return value, the skb is consumed, so it is currently 1666 * difficult to retry a send to this method. (You can bump the ref count 1667 * before sending to hold a reference for retry if you are careful.) 1668 * 1669 * When calling this method, interrupts MUST be enabled. This is because 1670 * the BH enable code must have IRQs enabled so that it will not deadlock. 1671 * --BLG 1672 */ 1673 1674 static u32 simple_tx_hashrnd; 1675 static int simple_tx_hashrnd_initialized = 0; 1676 1677 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) 1678 { 1679 u32 addr1, addr2, ports; 1680 u32 hash, ihl; 1681 u8 ip_proto; 1682 1683 if (unlikely(!simple_tx_hashrnd_initialized)) { 1684 get_random_bytes(&simple_tx_hashrnd, 4); 1685 simple_tx_hashrnd_initialized = 1; 1686 } 1687 1688 switch (skb->protocol) { 1689 case __constant_htons(ETH_P_IP): 1690 ip_proto = ip_hdr(skb)->protocol; 1691 addr1 = ip_hdr(skb)->saddr; 1692 addr2 = ip_hdr(skb)->daddr; 1693 ihl = ip_hdr(skb)->ihl; 1694 break; 1695 case __constant_htons(ETH_P_IPV6): 1696 ip_proto = ipv6_hdr(skb)->nexthdr; 1697 addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; 1698 addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; 1699 ihl = (40 >> 2); 1700 break; 1701 default: 1702 return 0; 1703 } 1704 1705 1706 switch (ip_proto) { 1707 case IPPROTO_TCP: 1708 case IPPROTO_UDP: 1709 case IPPROTO_DCCP: 1710 case IPPROTO_ESP: 1711 case IPPROTO_AH: 1712 case IPPROTO_SCTP: 1713 case IPPROTO_UDPLITE: 1714 ports = *((u32 *) (skb_network_header(skb) + (ihl * 4))); 1715 break; 1716 1717 default: 1718 ports = 0; 1719 break; 1720 } 1721 1722 hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); 1723 1724 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 1725 } 1726 1727 static struct netdev_queue *dev_pick_tx(struct net_device *dev, 1728 struct sk_buff *skb) 1729 { 1730 u16 queue_index = 0; 1731 1732 if (dev->select_queue) 1733 queue_index = dev->select_queue(dev, skb); 1734 else if 
(dev->real_num_tx_queues > 1) 1735 queue_index = simple_tx_hash(dev, skb); 1736 1737 skb_set_queue_mapping(skb, queue_index); 1738 return netdev_get_tx_queue(dev, queue_index); 1739 } 1740 1741 int dev_queue_xmit(struct sk_buff *skb) 1742 { 1743 struct net_device *dev = skb->dev; 1744 struct netdev_queue *txq; 1745 struct Qdisc *q; 1746 int rc = -ENOMEM; 1747 1748 /* GSO will handle the following emulations directly. */ 1749 if (netif_needs_gso(dev, skb)) 1750 goto gso; 1751 1752 if (skb_shinfo(skb)->frag_list && 1753 !(dev->features & NETIF_F_FRAGLIST) && 1754 __skb_linearize(skb)) 1755 goto out_kfree_skb; 1756 1757 /* Fragmented skb is linearized if device does not support SG, 1758 * or if at least one of fragments is in highmem and device 1759 * does not support DMA from it. 1760 */ 1761 if (skb_shinfo(skb)->nr_frags && 1762 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && 1763 __skb_linearize(skb)) 1764 goto out_kfree_skb; 1765 1766 /* If packet is not checksummed and device does not support 1767 * checksumming for this protocol, complete checksumming here. 1768 */ 1769 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1770 skb_set_transport_header(skb, skb->csum_start - 1771 skb_headroom(skb)); 1772 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) 1773 goto out_kfree_skb; 1774 } 1775 1776 gso: 1777 /* Disable soft irqs for various locks below. Also 1778 * stops preemption for RCU. 1779 */ 1780 rcu_read_lock_bh(); 1781 1782 txq = dev_pick_tx(dev, skb); 1783 q = rcu_dereference(txq->qdisc); 1784 1785 #ifdef CONFIG_NET_CLS_ACT 1786 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); 1787 #endif 1788 if (q->enqueue) { 1789 spinlock_t *root_lock = qdisc_root_lock(q); 1790 1791 spin_lock(root_lock); 1792 1793 rc = qdisc_enqueue_root(skb, q); 1794 qdisc_run(q); 1795 1796 spin_unlock(root_lock); 1797 1798 rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; 1799 goto out; 1800 } 1801 1802 /* The device has no queue. 
Common case for software devices: 1803 loopback, all the sorts of tunnels... 1804 1805 Really, it is unlikely that netif_tx_lock protection is necessary 1806 here. (f.e. loopback and IP tunnels are clean ignoring statistics 1807 counters.) 1808 However, it is possible, that they rely on protection 1809 made by us here. 1810 1811 Check this and shot the lock. It is not prone from deadlocks. 1812 Either shot noqueue qdisc, it is even simpler 8) 1813 */ 1814 if (dev->flags & IFF_UP) { 1815 int cpu = smp_processor_id(); /* ok because BHs are off */ 1816 1817 if (txq->xmit_lock_owner != cpu) { 1818 1819 HARD_TX_LOCK(dev, txq, cpu); 1820 1821 if (!netif_tx_queue_stopped(txq)) { 1822 rc = 0; 1823 if (!dev_hard_start_xmit(skb, dev, txq)) { 1824 HARD_TX_UNLOCK(dev, txq); 1825 goto out; 1826 } 1827 } 1828 HARD_TX_UNLOCK(dev, txq); 1829 if (net_ratelimit()) 1830 printk(KERN_CRIT "Virtual device %s asks to " 1831 "queue packet!\n", dev->name); 1832 } else { 1833 /* Recursion is detected! It is possible, 1834 * unfortunately */ 1835 if (net_ratelimit()) 1836 printk(KERN_CRIT "Dead loop on virtual device " 1837 "%s, fix it urgently!\n", dev->name); 1838 } 1839 } 1840 1841 rc = -ENETDOWN; 1842 rcu_read_unlock_bh(); 1843 1844 out_kfree_skb: 1845 kfree_skb(skb); 1846 return rc; 1847 out: 1848 rcu_read_unlock_bh(); 1849 return rc; 1850 } 1851 1852 1853 /*======================================================================= 1854 Receiver routines 1855 =======================================================================*/ 1856 1857 int netdev_max_backlog __read_mostly = 1000; 1858 int netdev_budget __read_mostly = 300; 1859 int weight_p __read_mostly = 64; /* old backlog weight */ 1860 1861 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 1862 1863 1864 /** 1865 * netif_rx - post buffer to the network code 1866 * @skb: buffer to post 1867 * 1868 * This function receives a packet from a device driver and queues it for 1869 * the upper (protocol) levels to process. 
 * It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	struct softnet_data *queue;
	unsigned long flags;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	/*
	 * The code is rearranged so that the path is the most
	 * short when CPU is congested, but is still operating.
	 */
	local_irq_save(flags);
	queue = &__get_cpu_var(softnet_data);

	__get_cpu_var(netdev_rx_stat).total++;
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
enqueue:
			/* Hold the device while the skb sits on the
			 * backlog; process_backlog() drops the reference.
			 */
			dev_hold(skb->dev);
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Queue was empty: schedule the backlog poller first. */
		napi_schedule(&queue->backlog);
		goto enqueue;
	}

	__get_cpu_var(netdev_rx_stat).dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}

/*
 * netif_rx() with preemption disabled around it; any softirq that is
 * pending afterwards is run before returning.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	preempt_disable();
	err = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}

EXPORT_SYMBOL(netif_rx_ni);

/*
 * If the receiving device has a master, either drop the skb (it is
 * freed and NULL is returned) or redirect skb->dev to the master.
 * Otherwise the skb is untouched.  Returns the device the skb arrived
 * on.
 */
static inline struct net_device *skb_bond(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;

	if (dev->master) {
		if (skb_bond_should_drop(skb)) {
			kfree_skb(skb);
			return NULL;
		}
		skb->dev = dev->master;
	}

	return dev;
}


/*
 * TX softirq handler for this CPU: frees the skbs collected on the
 * completion_queue, then runs every qdisc on the output_queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with IRQs off, free it with IRQs on. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			BUG_TRAP(!atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			smp_mb__before_clear_bit();
			clear_bit(__QDISC_STATE_SCHED, &q->state);

			/* Run the qdisc if its root lock is free, otherwise
			 * put it back on the schedule list for a later pass.
			 */
			root_lock = qdisc_root_lock(q);
			if (spin_trylock(root_lock)) {
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				__netif_schedule(q);
			}
		}
	}
}

/*
 * Hand @skb to the handler @pt_prev after taking an extra reference on
 * it.  Returns whatever the handler returns.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
/*
 * Pass @skb to the bridge hook when its device is a bridge port and the
 * packet is not PACKET_LOOPBACK; otherwise return it unchanged.  A
 * pending handler in *@pt_prev is delivered to first and then cleared.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK ||
	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif

#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook when its device has a macvlan port;
 * otherwise return it unchanged.  A pending handler in *@pt_prev is
 * delivered to first and then cleared.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (skb->dev->macvlan_port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}
	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif

#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we don't have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
/*
 * Run @skb through the qdisc attached to its device's rx_queue and
 * return the verdict; TC_ACT_OK when no qdisc is attached.  The
 * redirect counter kept in tc_verd is incremented on every pass and the
 * packet gets TC_ACT_SHOT once the counter had already exceeded
 * MAX_RED_LOOP.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	u32 ttl = G_TC_RTTL(skb->tc_verd);
	struct netdev_queue *rxq;
	int result = TC_ACT_OK;
	struct Qdisc *q;

	if (MAX_RED_LOOP < ttl++) {
		printk(KERN_WARNING
		       "Redir loop detected Dropping packet (%d->%d)\n",
		       skb->iif, dev->ifindex);
		return TC_ACT_SHOT;
	}

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	rxq = &dev->rx_queue;

	q = rxq->qdisc;
	if (q) {
		spin_lock(qdisc_lock(q));
		result = qdisc_enqueue_root(skb, q);
		spin_unlock(qdisc_lock(q));
	}

	return result;
}

/*
 * Ingress classification step of the receive path.  Without an rx
 * qdisc the skb is passed through.  Otherwise a pending handler in
 * *@pt_prev is delivered to first, the skb is filtered, and it is freed
 * (NULL returned) on a TC_ACT_SHOT or TC_ACT_STOLEN verdict.  tc_verd
 * is reset to 0 on every skb that is passed on.
 */
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
	if (!skb->dev->rx_queue.qdisc)
		goto out;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	} else {
		/* Huh? Why does turning on AF_PACKET affect this? */
		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
	}

	switch (ing_filter(skb)) {
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
		kfree_skb(skb);
		return NULL;
	}

out:
	skb->tc_verd = 0;
	return skb;
}
#endif

/*
 * 	netif_nit_deliver - deliver received packets to network taps
 * 	@skb: buffer
 *
 * 	This function is used to deliver incoming packets to network
 * 	taps. It should be used when the normal netif_receive_skb path
 * 	is bypassed, for example because of VLAN acceleration.
 */
void netif_nit_deliver(struct sk_buff *skb)
{
	struct packet_type *ptype;

	if (list_empty(&ptype_all))
		return;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev)
			deliver_skb(skb, ptype, skb->dev);
	}
	rcu_read_unlock();
}

/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	int ret = NET_RX_DROP;
	__be16 type;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	if (!skb->iif)
		skb->iif = skb->dev->ifindex;

	/* skb_bond() frees the skb itself when it returns NULL. */
	orig_dev = skb_bond(skb);

	if (!orig_dev)
		return NET_RX_DROP;

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	/*
	 * pt_prev trails one handler behind the list walk: each matching
	 * handler is only delivered to (with an extra reference) once the
	 * next match is found, so the final one can consume the skb
	 * without the extra reference.
	 */
	pt_prev = NULL;

	rcu_read_lock();

	/* Don't receive packets in an exiting network namespace */
	if (!net_alive(dev_net(skb->dev)))
		goto out;

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Taps registered for all protocols. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/* Handlers registered for this packet's protocol. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}

/*
 * NAPI poll function for the per-cpu backlog queue filled by
 * netif_rx().  Feeds queued skbs to netif_receive_skb() until the queue
 * is empty (the NAPI instance is then completed), @quota packets have
 * been handled, or the jiffies counter has advanced.  Returns the
 * number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;

	napi->weight = weight_p;
	do {
		struct sk_buff *skb;
		struct net_device *dev;

		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb) {
			__napi_complete(napi);
			local_irq_enable();
			break;
		}

		local_irq_enable();

		/* Remember the device before the skb is handed on. */
		dev = skb->dev;

		netif_receive_skb(skb);

		/* Pairs with the dev_hold() in netif_rx(). */
		dev_put(dev);
	} while (++work < quota && jiffies == start_time);

	return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);


/*
 * RX softirq handler: polls the NAPI instances on this CPU's poll_list
 * until the list is empty, netdev_budget is used up, or the jiffies
 * counter has advanced.  In the latter two cases time_squeeze is
 * counted and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long start_time = jiffies;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 *
		 * Note that this is a slight policy change from the
		 * previous NAPI code, which would allow up to 2
		 * jiffies to pass before breaking out.  The test
		 * used to be "jiffies - start_time > 1".
		 */
		if (unlikely(budget <= 0 || jiffies != start_time))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_entry(list->next, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state))
			work = n->poll(n, weight);

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n)))
				__napi_complete(n);
			else
				list_move_tail(&n->poll_list, list);
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	if (!cpus_empty(net_dma.channel_mask)) {
		int chan_idx;
		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
			struct dma_chan *chan = net_dma.channels[chan_idx];
			if (chan)
				dma_async_memcpy_issue_pending(chan);
		}
	}
#endif

	return;

softnet_break:
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}

/* Per address family SIOCGIFCONF handlers, set by register_gifconf(). */
static gifconf_func_t * gifconf_list [NPROTO];

/**
 *	register_gifconf	-	register a SIOCGIF handler
 *	@family: Address family
 *	@gifconf: Function handler
 *
 *	Register protocol dependent address dumping routines. The handler
 *	that is passed must not be freed or reused until it has been replaced
 *	by another handler.
 *
 *	Returns 0 on success or -EINVAL when @family is not below NPROTO.
 */
int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
{
	if (family >= NPROTO)
		return -EINVAL;
	gifconf_list[family] = gifconf;
	return 0;
}


/*
 *	Map an interface index to its name (SIOCGIFNAME)
 */

/*
 *	We need this ioctl for efficient implementation of the
 *	if_indextoname() function required by the IPv6 API.  Without
 *	it, we would have to search all the interfaces to find a
 *	match.  --pb
 */

static int dev_ifname(struct net *net, struct ifreq __user *arg)
{
	struct net_device *dev;
	struct ifreq ifr;

	/*
	 *	Fetch the caller's info block.
2442 */ 2443 2444 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 2445 return -EFAULT; 2446 2447 read_lock(&dev_base_lock); 2448 dev = __dev_get_by_index(net, ifr.ifr_ifindex); 2449 if (!dev) { 2450 read_unlock(&dev_base_lock); 2451 return -ENODEV; 2452 } 2453 2454 strcpy(ifr.ifr_name, dev->name); 2455 read_unlock(&dev_base_lock); 2456 2457 if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) 2458 return -EFAULT; 2459 return 0; 2460 } 2461 2462 /* 2463 * Perform a SIOCGIFCONF call. This structure will change 2464 * size eventually, and there is nothing I can do about it. 2465 * Thus we will need a 'compatibility mode'. 2466 */ 2467 2468 static int dev_ifconf(struct net *net, char __user *arg) 2469 { 2470 struct ifconf ifc; 2471 struct net_device *dev; 2472 char __user *pos; 2473 int len; 2474 int total; 2475 int i; 2476 2477 /* 2478 * Fetch the caller's info block. 2479 */ 2480 2481 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 2482 return -EFAULT; 2483 2484 pos = ifc.ifc_buf; 2485 len = ifc.ifc_len; 2486 2487 /* 2488 * Loop over the interfaces, and write an info block for each. 2489 */ 2490 2491 total = 0; 2492 for_each_netdev(net, dev) { 2493 for (i = 0; i < NPROTO; i++) { 2494 if (gifconf_list[i]) { 2495 int done; 2496 if (!pos) 2497 done = gifconf_list[i](dev, NULL, 0); 2498 else 2499 done = gifconf_list[i](dev, pos + total, 2500 len - total); 2501 if (done < 0) 2502 return -EFAULT; 2503 total += done; 2504 } 2505 } 2506 } 2507 2508 /* 2509 * All done. Write the updated control block back to the caller. 2510 */ 2511 ifc.ifc_len = total; 2512 2513 /* 2514 * Both BSD and Solaris return 0 here, so we do too. 2515 */ 2516 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; 2517 } 2518 2519 #ifdef CONFIG_PROC_FS 2520 /* 2521 * This is invoked by the /proc filesystem handler to display a device 2522 * in detail. 
2523 */ 2524 void *dev_seq_start(struct seq_file *seq, loff_t *pos) 2525 __acquires(dev_base_lock) 2526 { 2527 struct net *net = seq_file_net(seq); 2528 loff_t off; 2529 struct net_device *dev; 2530 2531 read_lock(&dev_base_lock); 2532 if (!*pos) 2533 return SEQ_START_TOKEN; 2534 2535 off = 1; 2536 for_each_netdev(net, dev) 2537 if (off++ == *pos) 2538 return dev; 2539 2540 return NULL; 2541 } 2542 2543 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2544 { 2545 struct net *net = seq_file_net(seq); 2546 ++*pos; 2547 return v == SEQ_START_TOKEN ? 2548 first_net_device(net) : next_net_device((struct net_device *)v); 2549 } 2550 2551 void dev_seq_stop(struct seq_file *seq, void *v) 2552 __releases(dev_base_lock) 2553 { 2554 read_unlock(&dev_base_lock); 2555 } 2556 2557 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 2558 { 2559 struct net_device_stats *stats = dev->get_stats(dev); 2560 2561 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " 2562 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", 2563 dev->name, stats->rx_bytes, stats->rx_packets, 2564 stats->rx_errors, 2565 stats->rx_dropped + stats->rx_missed_errors, 2566 stats->rx_fifo_errors, 2567 stats->rx_length_errors + stats->rx_over_errors + 2568 stats->rx_crc_errors + stats->rx_frame_errors, 2569 stats->rx_compressed, stats->multicast, 2570 stats->tx_bytes, stats->tx_packets, 2571 stats->tx_errors, stats->tx_dropped, 2572 stats->tx_fifo_errors, stats->collisions, 2573 stats->tx_carrier_errors + 2574 stats->tx_aborted_errors + 2575 stats->tx_window_errors + 2576 stats->tx_heartbeat_errors, 2577 stats->tx_compressed); 2578 } 2579 2580 /* 2581 * Called from the PROCfs module. 
This now uses the new arbitrary sized 2582 * /proc/net interface to create /proc/net/dev 2583 */ 2584 static int dev_seq_show(struct seq_file *seq, void *v) 2585 { 2586 if (v == SEQ_START_TOKEN) 2587 seq_puts(seq, "Inter-| Receive " 2588 " | Transmit\n" 2589 " face |bytes packets errs drop fifo frame " 2590 "compressed multicast|bytes packets errs " 2591 "drop fifo colls carrier compressed\n"); 2592 else 2593 dev_seq_printf_stats(seq, v); 2594 return 0; 2595 } 2596 2597 static struct netif_rx_stats *softnet_get_online(loff_t *pos) 2598 { 2599 struct netif_rx_stats *rc = NULL; 2600 2601 while (*pos < nr_cpu_ids) 2602 if (cpu_online(*pos)) { 2603 rc = &per_cpu(netdev_rx_stat, *pos); 2604 break; 2605 } else 2606 ++*pos; 2607 return rc; 2608 } 2609 2610 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 2611 { 2612 return softnet_get_online(pos); 2613 } 2614 2615 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2616 { 2617 ++*pos; 2618 return softnet_get_online(pos); 2619 } 2620 2621 static void softnet_seq_stop(struct seq_file *seq, void *v) 2622 { 2623 } 2624 2625 static int softnet_seq_show(struct seq_file *seq, void *v) 2626 { 2627 struct netif_rx_stats *s = v; 2628 2629 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 2630 s->total, s->dropped, s->time_squeeze, 0, 2631 0, 0, 0, 0, /* was fastroute */ 2632 s->cpu_collision ); 2633 return 0; 2634 } 2635 2636 static const struct seq_operations dev_seq_ops = { 2637 .start = dev_seq_start, 2638 .next = dev_seq_next, 2639 .stop = dev_seq_stop, 2640 .show = dev_seq_show, 2641 }; 2642 2643 static int dev_seq_open(struct inode *inode, struct file *file) 2644 { 2645 return seq_open_net(inode, file, &dev_seq_ops, 2646 sizeof(struct seq_net_private)); 2647 } 2648 2649 static const struct file_operations dev_seq_fops = { 2650 .owner = THIS_MODULE, 2651 .open = dev_seq_open, 2652 .read = seq_read, 2653 .llseek = seq_lseek, 2654 .release = seq_release_net, 2655 }; 2656 
2657 static const struct seq_operations softnet_seq_ops = { 2658 .start = softnet_seq_start, 2659 .next = softnet_seq_next, 2660 .stop = softnet_seq_stop, 2661 .show = softnet_seq_show, 2662 }; 2663 2664 static int softnet_seq_open(struct inode *inode, struct file *file) 2665 { 2666 return seq_open(file, &softnet_seq_ops); 2667 } 2668 2669 static const struct file_operations softnet_seq_fops = { 2670 .owner = THIS_MODULE, 2671 .open = softnet_seq_open, 2672 .read = seq_read, 2673 .llseek = seq_lseek, 2674 .release = seq_release, 2675 }; 2676 2677 static void *ptype_get_idx(loff_t pos) 2678 { 2679 struct packet_type *pt = NULL; 2680 loff_t i = 0; 2681 int t; 2682 2683 list_for_each_entry_rcu(pt, &ptype_all, list) { 2684 if (i == pos) 2685 return pt; 2686 ++i; 2687 } 2688 2689 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 2690 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 2691 if (i == pos) 2692 return pt; 2693 ++i; 2694 } 2695 } 2696 return NULL; 2697 } 2698 2699 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 2700 __acquires(RCU) 2701 { 2702 rcu_read_lock(); 2703 return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; 2704 } 2705 2706 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2707 { 2708 struct packet_type *pt; 2709 struct list_head *nxt; 2710 int hash; 2711 2712 ++*pos; 2713 if (v == SEQ_START_TOKEN) 2714 return ptype_get_idx(0); 2715 2716 pt = v; 2717 nxt = pt->list.next; 2718 if (pt->type == htons(ETH_P_ALL)) { 2719 if (nxt != &ptype_all) 2720 goto found; 2721 hash = 0; 2722 nxt = ptype_base[0].next; 2723 } else 2724 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 2725 2726 while (nxt == &ptype_base[hash]) { 2727 if (++hash >= PTYPE_HASH_SIZE) 2728 return NULL; 2729 nxt = ptype_base[hash].next; 2730 } 2731 found: 2732 return list_entry(nxt, struct packet_type, list); 2733 } 2734 2735 static void ptype_seq_stop(struct seq_file *seq, void *v) 2736 __releases(RCU) 2737 { 2738 rcu_read_unlock(); 2739 } 2740 2741 static void ptype_seq_decode(struct seq_file *seq, void *sym) 2742 { 2743 #ifdef CONFIG_KALLSYMS 2744 unsigned long offset = 0, symsize; 2745 const char *symname; 2746 char *modname; 2747 char namebuf[128]; 2748 2749 symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset, 2750 &modname, namebuf); 2751 2752 if (symname) { 2753 char *delim = ":"; 2754 2755 if (!modname) 2756 modname = delim = ""; 2757 seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim, 2758 symname, offset); 2759 return; 2760 } 2761 #endif 2762 2763 seq_printf(seq, "[%p]", sym); 2764 } 2765 2766 static int ptype_seq_show(struct seq_file *seq, void *v) 2767 { 2768 struct packet_type *pt = v; 2769 2770 if (v == SEQ_START_TOKEN) 2771 seq_puts(seq, "Type Device Function\n"); 2772 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 2773 if (pt->type == htons(ETH_P_ALL)) 2774 seq_puts(seq, "ALL "); 2775 else 2776 seq_printf(seq, "%04x", ntohs(pt->type)); 2777 2778 seq_printf(seq, " %-8s ", 2779 pt->dev ? 
pt->dev->name : ""); 2780 ptype_seq_decode(seq, pt->func); 2781 seq_putc(seq, '\n'); 2782 } 2783 2784 return 0; 2785 } 2786 2787 static const struct seq_operations ptype_seq_ops = { 2788 .start = ptype_seq_start, 2789 .next = ptype_seq_next, 2790 .stop = ptype_seq_stop, 2791 .show = ptype_seq_show, 2792 }; 2793 2794 static int ptype_seq_open(struct inode *inode, struct file *file) 2795 { 2796 return seq_open_net(inode, file, &ptype_seq_ops, 2797 sizeof(struct seq_net_private)); 2798 } 2799 2800 static const struct file_operations ptype_seq_fops = { 2801 .owner = THIS_MODULE, 2802 .open = ptype_seq_open, 2803 .read = seq_read, 2804 .llseek = seq_lseek, 2805 .release = seq_release_net, 2806 }; 2807 2808 2809 static int __net_init dev_proc_net_init(struct net *net) 2810 { 2811 int rc = -ENOMEM; 2812 2813 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) 2814 goto out; 2815 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) 2816 goto out_dev; 2817 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) 2818 goto out_softnet; 2819 2820 if (wext_proc_init(net)) 2821 goto out_ptype; 2822 rc = 0; 2823 out: 2824 return rc; 2825 out_ptype: 2826 proc_net_remove(net, "ptype"); 2827 out_softnet: 2828 proc_net_remove(net, "softnet_stat"); 2829 out_dev: 2830 proc_net_remove(net, "dev"); 2831 goto out; 2832 } 2833 2834 static void __net_exit dev_proc_net_exit(struct net *net) 2835 { 2836 wext_proc_exit(net); 2837 2838 proc_net_remove(net, "ptype"); 2839 proc_net_remove(net, "softnet_stat"); 2840 proc_net_remove(net, "dev"); 2841 } 2842 2843 static struct pernet_operations __net_initdata dev_proc_ops = { 2844 .init = dev_proc_net_init, 2845 .exit = dev_proc_net_exit, 2846 }; 2847 2848 static int __init dev_proc_init(void) 2849 { 2850 return register_pernet_subsys(&dev_proc_ops); 2851 } 2852 #else 2853 #define dev_proc_init() 0 2854 #endif /* CONFIG_PROC_FS */ 2855 2856 2857 /** 2858 * netdev_set_master - set up master/slave pair 2859 
* @slave: slave device 2860 * @master: new master device 2861 * 2862 * Changes the master device of the slave. Pass %NULL to break the 2863 * bonding. The caller must hold the RTNL semaphore. On a failure 2864 * a negative errno code is returned. On success the reference counts 2865 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 2866 * function returns zero. 2867 */ 2868 int netdev_set_master(struct net_device *slave, struct net_device *master) 2869 { 2870 struct net_device *old = slave->master; 2871 2872 ASSERT_RTNL(); 2873 2874 if (master) { 2875 if (old) 2876 return -EBUSY; 2877 dev_hold(master); 2878 } 2879 2880 slave->master = master; 2881 2882 synchronize_net(); 2883 2884 if (old) 2885 dev_put(old); 2886 2887 if (master) 2888 slave->flags |= IFF_SLAVE; 2889 else 2890 slave->flags &= ~IFF_SLAVE; 2891 2892 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 2893 return 0; 2894 } 2895 2896 static int __dev_set_promiscuity(struct net_device *dev, int inc) 2897 { 2898 unsigned short old_flags = dev->flags; 2899 2900 ASSERT_RTNL(); 2901 2902 dev->flags |= IFF_PROMISC; 2903 dev->promiscuity += inc; 2904 if (dev->promiscuity == 0) { 2905 /* 2906 * Avoid overflow. 2907 * If inc causes overflow, untouch promisc and return error. 2908 */ 2909 if (inc < 0) 2910 dev->flags &= ~IFF_PROMISC; 2911 else { 2912 dev->promiscuity -= inc; 2913 printk(KERN_WARNING "%s: promiscuity touches roof, " 2914 "set promiscuity failed, promiscuity feature " 2915 "of device might be broken.\n", dev->name); 2916 return -EOVERFLOW; 2917 } 2918 } 2919 if (dev->flags != old_flags) { 2920 printk(KERN_INFO "device %s %s promiscuous mode\n", 2921 dev->name, (dev->flags & IFF_PROMISC) ? 
"entered" : 2922 "left"); 2923 if (audit_enabled) 2924 audit_log(current->audit_context, GFP_ATOMIC, 2925 AUDIT_ANOM_PROMISCUOUS, 2926 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 2927 dev->name, (dev->flags & IFF_PROMISC), 2928 (old_flags & IFF_PROMISC), 2929 audit_get_loginuid(current), 2930 current->uid, current->gid, 2931 audit_get_sessionid(current)); 2932 2933 if (dev->change_rx_flags) 2934 dev->change_rx_flags(dev, IFF_PROMISC); 2935 } 2936 return 0; 2937 } 2938 2939 /** 2940 * dev_set_promiscuity - update promiscuity count on a device 2941 * @dev: device 2942 * @inc: modifier 2943 * 2944 * Add or remove promiscuity from a device. While the count in the device 2945 * remains above zero the interface remains promiscuous. Once it hits zero 2946 * the device reverts back to normal filtering operation. A negative inc 2947 * value is used to drop promiscuity on the device. 2948 * Return 0 if successful or a negative errno code on error. 2949 */ 2950 int dev_set_promiscuity(struct net_device *dev, int inc) 2951 { 2952 unsigned short old_flags = dev->flags; 2953 int err; 2954 2955 err = __dev_set_promiscuity(dev, inc); 2956 if (err < 0) 2957 return err; 2958 if (dev->flags != old_flags) 2959 dev_set_rx_mode(dev); 2960 return err; 2961 } 2962 2963 /** 2964 * dev_set_allmulti - update allmulti count on a device 2965 * @dev: device 2966 * @inc: modifier 2967 * 2968 * Add or remove reception of all multicast frames to a device. While the 2969 * count in the device remains above zero the interface remains listening 2970 * to all interfaces. Once it hits zero the device reverts back to normal 2971 * filtering operation. A negative @inc value is used to drop the counter 2972 * when releasing a resource needing all multicasts. 2973 * Return 0 if successful or a negative errno code on error. 
2974 */ 2975 2976 int dev_set_allmulti(struct net_device *dev, int inc) 2977 { 2978 unsigned short old_flags = dev->flags; 2979 2980 ASSERT_RTNL(); 2981 2982 dev->flags |= IFF_ALLMULTI; 2983 dev->allmulti += inc; 2984 if (dev->allmulti == 0) { 2985 /* 2986 * Avoid overflow. 2987 * If inc causes overflow, untouch allmulti and return error. 2988 */ 2989 if (inc < 0) 2990 dev->flags &= ~IFF_ALLMULTI; 2991 else { 2992 dev->allmulti -= inc; 2993 printk(KERN_WARNING "%s: allmulti touches roof, " 2994 "set allmulti failed, allmulti feature of " 2995 "device might be broken.\n", dev->name); 2996 return -EOVERFLOW; 2997 } 2998 } 2999 if (dev->flags ^ old_flags) { 3000 if (dev->change_rx_flags) 3001 dev->change_rx_flags(dev, IFF_ALLMULTI); 3002 dev_set_rx_mode(dev); 3003 } 3004 return 0; 3005 } 3006 3007 /* 3008 * Upload unicast and multicast address lists to device and 3009 * configure RX filtering. When the device doesn't support unicast 3010 * filtering it is put in promiscuous mode while unicast addresses 3011 * are present. 3012 */ 3013 void __dev_set_rx_mode(struct net_device *dev) 3014 { 3015 /* dev_open will call this function so the list will stay sane. */ 3016 if (!(dev->flags&IFF_UP)) 3017 return; 3018 3019 if (!netif_device_present(dev)) 3020 return; 3021 3022 if (dev->set_rx_mode) 3023 dev->set_rx_mode(dev); 3024 else { 3025 /* Unicast addresses changes may only happen under the rtnl, 3026 * therefore calling __dev_set_promiscuity here is safe. 
3027 */ 3028 if (dev->uc_count > 0 && !dev->uc_promisc) { 3029 __dev_set_promiscuity(dev, 1); 3030 dev->uc_promisc = 1; 3031 } else if (dev->uc_count == 0 && dev->uc_promisc) { 3032 __dev_set_promiscuity(dev, -1); 3033 dev->uc_promisc = 0; 3034 } 3035 3036 if (dev->set_multicast_list) 3037 dev->set_multicast_list(dev); 3038 } 3039 } 3040 3041 void dev_set_rx_mode(struct net_device *dev) 3042 { 3043 netif_addr_lock_bh(dev); 3044 __dev_set_rx_mode(dev); 3045 netif_addr_unlock_bh(dev); 3046 } 3047 3048 int __dev_addr_delete(struct dev_addr_list **list, int *count, 3049 void *addr, int alen, int glbl) 3050 { 3051 struct dev_addr_list *da; 3052 3053 for (; (da = *list) != NULL; list = &da->next) { 3054 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && 3055 alen == da->da_addrlen) { 3056 if (glbl) { 3057 int old_glbl = da->da_gusers; 3058 da->da_gusers = 0; 3059 if (old_glbl == 0) 3060 break; 3061 } 3062 if (--da->da_users) 3063 return 0; 3064 3065 *list = da->next; 3066 kfree(da); 3067 (*count)--; 3068 return 0; 3069 } 3070 } 3071 return -ENOENT; 3072 } 3073 3074 int __dev_addr_add(struct dev_addr_list **list, int *count, 3075 void *addr, int alen, int glbl) 3076 { 3077 struct dev_addr_list *da; 3078 3079 for (da = *list; da != NULL; da = da->next) { 3080 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && 3081 da->da_addrlen == alen) { 3082 if (glbl) { 3083 int old_glbl = da->da_gusers; 3084 da->da_gusers = 1; 3085 if (old_glbl) 3086 return 0; 3087 } 3088 da->da_users++; 3089 return 0; 3090 } 3091 } 3092 3093 da = kzalloc(sizeof(*da), GFP_ATOMIC); 3094 if (da == NULL) 3095 return -ENOMEM; 3096 memcpy(da->da_addr, addr, alen); 3097 da->da_addrlen = alen; 3098 da->da_users = 1; 3099 da->da_gusers = glbl ? 1 : 0; 3100 da->next = *list; 3101 *list = da; 3102 (*count)++; 3103 return 0; 3104 } 3105 3106 /** 3107 * dev_unicast_delete - Release secondary unicast address. 
3108 * @dev: device 3109 * @addr: address to delete 3110 * @alen: length of @addr 3111 * 3112 * Release reference to a secondary unicast address and remove it 3113 * from the device if the reference count drops to zero. 3114 * 3115 * The caller must hold the rtnl_mutex. 3116 */ 3117 int dev_unicast_delete(struct net_device *dev, void *addr, int alen) 3118 { 3119 int err; 3120 3121 ASSERT_RTNL(); 3122 3123 netif_addr_lock_bh(dev); 3124 err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); 3125 if (!err) 3126 __dev_set_rx_mode(dev); 3127 netif_addr_unlock_bh(dev); 3128 return err; 3129 } 3130 EXPORT_SYMBOL(dev_unicast_delete); 3131 3132 /** 3133 * dev_unicast_add - add a secondary unicast address 3134 * @dev: device 3135 * @addr: address to add 3136 * @alen: length of @addr 3137 * 3138 * Add a secondary unicast address to the device or increase 3139 * the reference count if it already exists. 3140 * 3141 * The caller must hold the rtnl_mutex. 3142 */ 3143 int dev_unicast_add(struct net_device *dev, void *addr, int alen) 3144 { 3145 int err; 3146 3147 ASSERT_RTNL(); 3148 3149 netif_addr_lock_bh(dev); 3150 err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); 3151 if (!err) 3152 __dev_set_rx_mode(dev); 3153 netif_addr_unlock_bh(dev); 3154 return err; 3155 } 3156 EXPORT_SYMBOL(dev_unicast_add); 3157 3158 int __dev_addr_sync(struct dev_addr_list **to, int *to_count, 3159 struct dev_addr_list **from, int *from_count) 3160 { 3161 struct dev_addr_list *da, *next; 3162 int err = 0; 3163 3164 da = *from; 3165 while (da != NULL) { 3166 next = da->next; 3167 if (!da->da_synced) { 3168 err = __dev_addr_add(to, to_count, 3169 da->da_addr, da->da_addrlen, 0); 3170 if (err < 0) 3171 break; 3172 da->da_synced = 1; 3173 da->da_users++; 3174 } else if (da->da_users == 1) { 3175 __dev_addr_delete(to, to_count, 3176 da->da_addr, da->da_addrlen, 0); 3177 __dev_addr_delete(from, from_count, 3178 da->da_addr, da->da_addrlen, 0); 3179 } 3180 da = next; 3181 } 
3182 return err; 3183 } 3184 3185 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, 3186 struct dev_addr_list **from, int *from_count) 3187 { 3188 struct dev_addr_list *da, *next; 3189 3190 da = *from; 3191 while (da != NULL) { 3192 next = da->next; 3193 if (da->da_synced) { 3194 __dev_addr_delete(to, to_count, 3195 da->da_addr, da->da_addrlen, 0); 3196 da->da_synced = 0; 3197 __dev_addr_delete(from, from_count, 3198 da->da_addr, da->da_addrlen, 0); 3199 } 3200 da = next; 3201 } 3202 } 3203 3204 /** 3205 * dev_unicast_sync - Synchronize device's unicast list to another device 3206 * @to: destination device 3207 * @from: source device 3208 * 3209 * Add newly added addresses to the destination device and release 3210 * addresses that have no users left. The source device must be 3211 * locked by netif_tx_lock_bh. 3212 * 3213 * This function is intended to be called from the dev->set_rx_mode 3214 * function of layered software devices. 3215 */ 3216 int dev_unicast_sync(struct net_device *to, struct net_device *from) 3217 { 3218 int err = 0; 3219 3220 netif_addr_lock_bh(to); 3221 err = __dev_addr_sync(&to->uc_list, &to->uc_count, 3222 &from->uc_list, &from->uc_count); 3223 if (!err) 3224 __dev_set_rx_mode(to); 3225 netif_addr_unlock_bh(to); 3226 return err; 3227 } 3228 EXPORT_SYMBOL(dev_unicast_sync); 3229 3230 /** 3231 * dev_unicast_unsync - Remove synchronized addresses from the destination device 3232 * @to: destination device 3233 * @from: source device 3234 * 3235 * Remove all addresses that were added to the destination device by 3236 * dev_unicast_sync(). This function is intended to be called from the 3237 * dev->stop function of layered software devices. 
3238 */ 3239 void dev_unicast_unsync(struct net_device *to, struct net_device *from) 3240 { 3241 netif_addr_lock_bh(from); 3242 netif_addr_lock(to); 3243 3244 __dev_addr_unsync(&to->uc_list, &to->uc_count, 3245 &from->uc_list, &from->uc_count); 3246 __dev_set_rx_mode(to); 3247 3248 netif_addr_unlock(to); 3249 netif_addr_unlock_bh(from); 3250 } 3251 EXPORT_SYMBOL(dev_unicast_unsync); 3252 3253 static void __dev_addr_discard(struct dev_addr_list **list) 3254 { 3255 struct dev_addr_list *tmp; 3256 3257 while (*list != NULL) { 3258 tmp = *list; 3259 *list = tmp->next; 3260 if (tmp->da_users > tmp->da_gusers) 3261 printk("__dev_addr_discard: address leakage! " 3262 "da_users=%d\n", tmp->da_users); 3263 kfree(tmp); 3264 } 3265 } 3266 3267 static void dev_addr_discard(struct net_device *dev) 3268 { 3269 netif_addr_lock_bh(dev); 3270 3271 __dev_addr_discard(&dev->uc_list); 3272 dev->uc_count = 0; 3273 3274 __dev_addr_discard(&dev->mc_list); 3275 dev->mc_count = 0; 3276 3277 netif_addr_unlock_bh(dev); 3278 } 3279 3280 unsigned dev_get_flags(const struct net_device *dev) 3281 { 3282 unsigned flags; 3283 3284 flags = (dev->flags & ~(IFF_PROMISC | 3285 IFF_ALLMULTI | 3286 IFF_RUNNING | 3287 IFF_LOWER_UP | 3288 IFF_DORMANT)) | 3289 (dev->gflags & (IFF_PROMISC | 3290 IFF_ALLMULTI)); 3291 3292 if (netif_running(dev)) { 3293 if (netif_oper_up(dev)) 3294 flags |= IFF_RUNNING; 3295 if (netif_carrier_ok(dev)) 3296 flags |= IFF_LOWER_UP; 3297 if (netif_dormant(dev)) 3298 flags |= IFF_DORMANT; 3299 } 3300 3301 return flags; 3302 } 3303 3304 int dev_change_flags(struct net_device *dev, unsigned flags) 3305 { 3306 int ret, changes; 3307 int old_flags = dev->flags; 3308 3309 ASSERT_RTNL(); 3310 3311 /* 3312 * Set the flags on our device. 
3313 */ 3314 3315 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 3316 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 3317 IFF_AUTOMEDIA)) | 3318 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 3319 IFF_ALLMULTI)); 3320 3321 /* 3322 * Load in the correct multicast list now the flags have changed. 3323 */ 3324 3325 if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST) 3326 dev->change_rx_flags(dev, IFF_MULTICAST); 3327 3328 dev_set_rx_mode(dev); 3329 3330 /* 3331 * Have we downed the interface. We handle IFF_UP ourselves 3332 * according to user attempts to set it, rather than blindly 3333 * setting it. 3334 */ 3335 3336 ret = 0; 3337 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ 3338 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); 3339 3340 if (!ret) 3341 dev_set_rx_mode(dev); 3342 } 3343 3344 if (dev->flags & IFF_UP && 3345 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | 3346 IFF_VOLATILE))) 3347 call_netdevice_notifiers(NETDEV_CHANGE, dev); 3348 3349 if ((flags ^ dev->gflags) & IFF_PROMISC) { 3350 int inc = (flags & IFF_PROMISC) ? +1 : -1; 3351 dev->gflags ^= IFF_PROMISC; 3352 dev_set_promiscuity(dev, inc); 3353 } 3354 3355 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 3356 is important. Some (broken) drivers set IFF_PROMISC, when 3357 IFF_ALLMULTI is requested not asking us and not reporting. 3358 */ 3359 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 3360 int inc = (flags & IFF_ALLMULTI) ? +1 : -1; 3361 dev->gflags ^= IFF_ALLMULTI; 3362 dev_set_allmulti(dev, inc); 3363 } 3364 3365 /* Exclude state transition flags, already notified */ 3366 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); 3367 if (changes) 3368 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 3369 3370 return ret; 3371 } 3372 3373 int dev_set_mtu(struct net_device *dev, int new_mtu) 3374 { 3375 int err; 3376 3377 if (new_mtu == dev->mtu) 3378 return 0; 3379 3380 /* MTU must be positive. 
*/ 3381 if (new_mtu < 0) 3382 return -EINVAL; 3383 3384 if (!netif_device_present(dev)) 3385 return -ENODEV; 3386 3387 err = 0; 3388 if (dev->change_mtu) 3389 err = dev->change_mtu(dev, new_mtu); 3390 else 3391 dev->mtu = new_mtu; 3392 if (!err && dev->flags & IFF_UP) 3393 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 3394 return err; 3395 } 3396 3397 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 3398 { 3399 int err; 3400 3401 if (!dev->set_mac_address) 3402 return -EOPNOTSUPP; 3403 if (sa->sa_family != dev->type) 3404 return -EINVAL; 3405 if (!netif_device_present(dev)) 3406 return -ENODEV; 3407 err = dev->set_mac_address(dev, sa); 3408 if (!err) 3409 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 3410 return err; 3411 } 3412 3413 /* 3414 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) 3415 */ 3416 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) 3417 { 3418 int err; 3419 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 3420 3421 if (!dev) 3422 return -ENODEV; 3423 3424 switch (cmd) { 3425 case SIOCGIFFLAGS: /* Get interface flags */ 3426 ifr->ifr_flags = dev_get_flags(dev); 3427 return 0; 3428 3429 case SIOCGIFMETRIC: /* Get the metric on the interface 3430 (currently unused) */ 3431 ifr->ifr_metric = 0; 3432 return 0; 3433 3434 case SIOCGIFMTU: /* Get the MTU of a device */ 3435 ifr->ifr_mtu = dev->mtu; 3436 return 0; 3437 3438 case SIOCGIFHWADDR: 3439 if (!dev->addr_len) 3440 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); 3441 else 3442 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 3443 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 3444 ifr->ifr_hwaddr.sa_family = dev->type; 3445 return 0; 3446 3447 case SIOCGIFSLAVE: 3448 err = -EINVAL; 3449 break; 3450 3451 case SIOCGIFMAP: 3452 ifr->ifr_map.mem_start = dev->mem_start; 3453 ifr->ifr_map.mem_end = dev->mem_end; 3454 ifr->ifr_map.base_addr = dev->base_addr; 3455 ifr->ifr_map.irq = 
dev->irq; 3456 ifr->ifr_map.dma = dev->dma; 3457 ifr->ifr_map.port = dev->if_port; 3458 return 0; 3459 3460 case SIOCGIFINDEX: 3461 ifr->ifr_ifindex = dev->ifindex; 3462 return 0; 3463 3464 case SIOCGIFTXQLEN: 3465 ifr->ifr_qlen = dev->tx_queue_len; 3466 return 0; 3467 3468 default: 3469 /* dev_ioctl() should ensure this case 3470 * is never reached 3471 */ 3472 WARN_ON(1); 3473 err = -EINVAL; 3474 break; 3475 3476 } 3477 return err; 3478 } 3479 3480 /* 3481 * Perform the SIOCxIFxxx calls, inside rtnl_lock() 3482 */ 3483 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 3484 { 3485 int err; 3486 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 3487 3488 if (!dev) 3489 return -ENODEV; 3490 3491 switch (cmd) { 3492 case SIOCSIFFLAGS: /* Set interface flags */ 3493 return dev_change_flags(dev, ifr->ifr_flags); 3494 3495 case SIOCSIFMETRIC: /* Set the metric on the interface 3496 (currently unused) */ 3497 return -EOPNOTSUPP; 3498 3499 case SIOCSIFMTU: /* Set the MTU of a device */ 3500 return dev_set_mtu(dev, ifr->ifr_mtu); 3501 3502 case SIOCSIFHWADDR: 3503 return dev_set_mac_address(dev, &ifr->ifr_hwaddr); 3504 3505 case SIOCSIFHWBROADCAST: 3506 if (ifr->ifr_hwaddr.sa_family != dev->type) 3507 return -EINVAL; 3508 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 3509 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 3510 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 3511 return 0; 3512 3513 case SIOCSIFMAP: 3514 if (dev->set_config) { 3515 if (!netif_device_present(dev)) 3516 return -ENODEV; 3517 return dev->set_config(dev, &ifr->ifr_map); 3518 } 3519 return -EOPNOTSUPP; 3520 3521 case SIOCADDMULTI: 3522 if ((!dev->set_multicast_list && !dev->set_rx_mode) || 3523 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 3524 return -EINVAL; 3525 if (!netif_device_present(dev)) 3526 return -ENODEV; 3527 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, 3528 dev->addr_len, 1); 3529 3530 case SIOCDELMULTI: 3531 if 
((!dev->set_multicast_list && !dev->set_rx_mode) || 3532 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 3533 return -EINVAL; 3534 if (!netif_device_present(dev)) 3535 return -ENODEV; 3536 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, 3537 dev->addr_len, 1); 3538 3539 case SIOCSIFTXQLEN: 3540 if (ifr->ifr_qlen < 0) 3541 return -EINVAL; 3542 dev->tx_queue_len = ifr->ifr_qlen; 3543 return 0; 3544 3545 case SIOCSIFNAME: 3546 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 3547 return dev_change_name(dev, ifr->ifr_newname); 3548 3549 /* 3550 * Unknown or private ioctl 3551 */ 3552 3553 default: 3554 if ((cmd >= SIOCDEVPRIVATE && 3555 cmd <= SIOCDEVPRIVATE + 15) || 3556 cmd == SIOCBONDENSLAVE || 3557 cmd == SIOCBONDRELEASE || 3558 cmd == SIOCBONDSETHWADDR || 3559 cmd == SIOCBONDSLAVEINFOQUERY || 3560 cmd == SIOCBONDINFOQUERY || 3561 cmd == SIOCBONDCHANGEACTIVE || 3562 cmd == SIOCGMIIPHY || 3563 cmd == SIOCGMIIREG || 3564 cmd == SIOCSMIIREG || 3565 cmd == SIOCBRADDIF || 3566 cmd == SIOCBRDELIF || 3567 cmd == SIOCWANDEV) { 3568 err = -EOPNOTSUPP; 3569 if (dev->do_ioctl) { 3570 if (netif_device_present(dev)) 3571 err = dev->do_ioctl(dev, ifr, 3572 cmd); 3573 else 3574 err = -ENODEV; 3575 } 3576 } else 3577 err = -EINVAL; 3578 3579 } 3580 return err; 3581 } 3582 3583 /* 3584 * This function handles all "interface"-type I/O control requests. The actual 3585 * 'doing' part of this is dev_ifsioc above. 3586 */ 3587 3588 /** 3589 * dev_ioctl - network device ioctl 3590 * @net: the applicable net namespace 3591 * @cmd: command to issue 3592 * @arg: pointer to a struct ifreq in user space 3593 * 3594 * Issue ioctl functions to devices. This is normally called by the 3595 * user space syscall interfaces but can sometimes be useful for 3596 * other purposes. The return value is the return from the syscall if 3597 * positive or a negative errno code on error. 
3598 */ 3599 3600 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3601 { 3602 struct ifreq ifr; 3603 int ret; 3604 char *colon; 3605 3606 /* One special case: SIOCGIFCONF takes ifconf argument 3607 and requires shared lock, because it sleeps writing 3608 to user space. 3609 */ 3610 3611 if (cmd == SIOCGIFCONF) { 3612 rtnl_lock(); 3613 ret = dev_ifconf(net, (char __user *) arg); 3614 rtnl_unlock(); 3615 return ret; 3616 } 3617 if (cmd == SIOCGIFNAME) 3618 return dev_ifname(net, (struct ifreq __user *)arg); 3619 3620 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 3621 return -EFAULT; 3622 3623 ifr.ifr_name[IFNAMSIZ-1] = 0; 3624 3625 colon = strchr(ifr.ifr_name, ':'); 3626 if (colon) 3627 *colon = 0; 3628 3629 /* 3630 * See which interface the caller is talking about. 3631 */ 3632 3633 switch (cmd) { 3634 /* 3635 * These ioctl calls: 3636 * - can be done by all. 3637 * - atomic and do not require locking. 3638 * - return a value 3639 */ 3640 case SIOCGIFFLAGS: 3641 case SIOCGIFMETRIC: 3642 case SIOCGIFMTU: 3643 case SIOCGIFHWADDR: 3644 case SIOCGIFSLAVE: 3645 case SIOCGIFMAP: 3646 case SIOCGIFINDEX: 3647 case SIOCGIFTXQLEN: 3648 dev_load(net, ifr.ifr_name); 3649 read_lock(&dev_base_lock); 3650 ret = dev_ifsioc_locked(net, &ifr, cmd); 3651 read_unlock(&dev_base_lock); 3652 if (!ret) { 3653 if (colon) 3654 *colon = ':'; 3655 if (copy_to_user(arg, &ifr, 3656 sizeof(struct ifreq))) 3657 ret = -EFAULT; 3658 } 3659 return ret; 3660 3661 case SIOCETHTOOL: 3662 dev_load(net, ifr.ifr_name); 3663 rtnl_lock(); 3664 ret = dev_ethtool(net, &ifr); 3665 rtnl_unlock(); 3666 if (!ret) { 3667 if (colon) 3668 *colon = ':'; 3669 if (copy_to_user(arg, &ifr, 3670 sizeof(struct ifreq))) 3671 ret = -EFAULT; 3672 } 3673 return ret; 3674 3675 /* 3676 * These ioctl calls: 3677 * - require superuser power. 3678 * - require strict serialization. 
3679 * - return a value 3680 */ 3681 case SIOCGMIIPHY: 3682 case SIOCGMIIREG: 3683 case SIOCSIFNAME: 3684 if (!capable(CAP_NET_ADMIN)) 3685 return -EPERM; 3686 dev_load(net, ifr.ifr_name); 3687 rtnl_lock(); 3688 ret = dev_ifsioc(net, &ifr, cmd); 3689 rtnl_unlock(); 3690 if (!ret) { 3691 if (colon) 3692 *colon = ':'; 3693 if (copy_to_user(arg, &ifr, 3694 sizeof(struct ifreq))) 3695 ret = -EFAULT; 3696 } 3697 return ret; 3698 3699 /* 3700 * These ioctl calls: 3701 * - require superuser power. 3702 * - require strict serialization. 3703 * - do not return a value 3704 */ 3705 case SIOCSIFFLAGS: 3706 case SIOCSIFMETRIC: 3707 case SIOCSIFMTU: 3708 case SIOCSIFMAP: 3709 case SIOCSIFHWADDR: 3710 case SIOCSIFSLAVE: 3711 case SIOCADDMULTI: 3712 case SIOCDELMULTI: 3713 case SIOCSIFHWBROADCAST: 3714 case SIOCSIFTXQLEN: 3715 case SIOCSMIIREG: 3716 case SIOCBONDENSLAVE: 3717 case SIOCBONDRELEASE: 3718 case SIOCBONDSETHWADDR: 3719 case SIOCBONDCHANGEACTIVE: 3720 case SIOCBRADDIF: 3721 case SIOCBRDELIF: 3722 if (!capable(CAP_NET_ADMIN)) 3723 return -EPERM; 3724 /* fall through */ 3725 case SIOCBONDSLAVEINFOQUERY: 3726 case SIOCBONDINFOQUERY: 3727 dev_load(net, ifr.ifr_name); 3728 rtnl_lock(); 3729 ret = dev_ifsioc(net, &ifr, cmd); 3730 rtnl_unlock(); 3731 return ret; 3732 3733 case SIOCGIFMEM: 3734 /* Get the per device memory space. We can add this but 3735 * currently do not support it */ 3736 case SIOCSIFMEM: 3737 /* Set the per device memory buffer space. 3738 * Not applicable in our case */ 3739 case SIOCSIFLINK: 3740 return -EINVAL; 3741 3742 /* 3743 * Unknown or private ioctl. 
3744 */ 3745 default: 3746 if (cmd == SIOCWANDEV || 3747 (cmd >= SIOCDEVPRIVATE && 3748 cmd <= SIOCDEVPRIVATE + 15)) { 3749 dev_load(net, ifr.ifr_name); 3750 rtnl_lock(); 3751 ret = dev_ifsioc(net, &ifr, cmd); 3752 rtnl_unlock(); 3753 if (!ret && copy_to_user(arg, &ifr, 3754 sizeof(struct ifreq))) 3755 ret = -EFAULT; 3756 return ret; 3757 } 3758 /* Take care of Wireless Extensions */ 3759 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 3760 return wext_handle_ioctl(net, &ifr, cmd, arg); 3761 return -EINVAL; 3762 } 3763 } 3764 3765 3766 /** 3767 * dev_new_index - allocate an ifindex 3768 * @net: the applicable net namespace 3769 * 3770 * Returns a suitable unique value for a new device interface 3771 * number. The caller must hold the rtnl semaphore or the 3772 * dev_base_lock to be sure it remains unique. 3773 */ 3774 static int dev_new_index(struct net *net) 3775 { 3776 static int ifindex; 3777 for (;;) { 3778 if (++ifindex <= 0) 3779 ifindex = 1; 3780 if (!__dev_get_by_index(net, ifindex)) 3781 return ifindex; 3782 } 3783 } 3784 3785 /* Delayed registration/unregisteration */ 3786 static DEFINE_SPINLOCK(net_todo_list_lock); 3787 static LIST_HEAD(net_todo_list); 3788 3789 static void net_set_todo(struct net_device *dev) 3790 { 3791 spin_lock(&net_todo_list_lock); 3792 list_add_tail(&dev->todo_list, &net_todo_list); 3793 spin_unlock(&net_todo_list_lock); 3794 } 3795 3796 static void rollback_registered(struct net_device *dev) 3797 { 3798 BUG_ON(dev_boot_phase); 3799 ASSERT_RTNL(); 3800 3801 /* Some devices call without registering for initialization unwind. */ 3802 if (dev->reg_state == NETREG_UNINITIALIZED) { 3803 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " 3804 "was registered\n", dev->name, dev); 3805 3806 WARN_ON(1); 3807 return; 3808 } 3809 3810 BUG_ON(dev->reg_state != NETREG_REGISTERED); 3811 3812 /* If device is running, close it first. */ 3813 dev_close(dev); 3814 3815 /* And unlink it from device chain. 
*/ 3816 unlist_netdevice(dev); 3817 3818 dev->reg_state = NETREG_UNREGISTERING; 3819 3820 synchronize_net(); 3821 3822 /* Shutdown queueing discipline. */ 3823 dev_shutdown(dev); 3824 3825 3826 /* Notify protocols, that we are about to destroy 3827 this device. They should clean all the things. 3828 */ 3829 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 3830 3831 /* 3832 * Flush the unicast and multicast chains 3833 */ 3834 dev_addr_discard(dev); 3835 3836 if (dev->uninit) 3837 dev->uninit(dev); 3838 3839 /* Notifier chain MUST detach us from master device. */ 3840 BUG_TRAP(!dev->master); 3841 3842 /* Remove entries from kobject tree */ 3843 netdev_unregister_kobject(dev); 3844 3845 synchronize_net(); 3846 3847 dev_put(dev); 3848 } 3849 3850 static void __netdev_init_queue_locks_one(struct net_device *dev, 3851 struct netdev_queue *dev_queue, 3852 void *_unused) 3853 { 3854 spin_lock_init(&dev_queue->_xmit_lock); 3855 netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type); 3856 dev_queue->xmit_lock_owner = -1; 3857 } 3858 3859 static void netdev_init_queue_locks(struct net_device *dev) 3860 { 3861 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 3862 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 3863 } 3864 3865 /** 3866 * register_netdevice - register a network device 3867 * @dev: device to register 3868 * 3869 * Take a completed network device structure and add it to the kernel 3870 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 3871 * chain. 0 is returned on success. A negative errno code is returned 3872 * on a failure to set up the device, or if the name is a duplicate. 3873 * 3874 * Callers must hold the rtnl semaphore. You may want 3875 * register_netdev() instead of this. 3876 * 3877 * BUGS: 3878 * The locking appears insufficient to guarantee two parallel registers 3879 * will not get the same name. 
3880 */ 3881 3882 int register_netdevice(struct net_device *dev) 3883 { 3884 struct hlist_head *head; 3885 struct hlist_node *p; 3886 int ret; 3887 struct net *net; 3888 3889 BUG_ON(dev_boot_phase); 3890 ASSERT_RTNL(); 3891 3892 might_sleep(); 3893 3894 /* When net_device's are persistent, this will be fatal. */ 3895 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 3896 BUG_ON(!dev_net(dev)); 3897 net = dev_net(dev); 3898 3899 spin_lock_init(&dev->addr_list_lock); 3900 netdev_init_queue_locks(dev); 3901 3902 dev->iflink = -1; 3903 3904 /* Init, if this function is available */ 3905 if (dev->init) { 3906 ret = dev->init(dev); 3907 if (ret) { 3908 if (ret > 0) 3909 ret = -EIO; 3910 goto out; 3911 } 3912 } 3913 3914 if (!dev_valid_name(dev->name)) { 3915 ret = -EINVAL; 3916 goto err_uninit; 3917 } 3918 3919 dev->ifindex = dev_new_index(net); 3920 if (dev->iflink == -1) 3921 dev->iflink = dev->ifindex; 3922 3923 /* Check for existence of name */ 3924 head = dev_name_hash(net, dev->name); 3925 hlist_for_each(p, head) { 3926 struct net_device *d 3927 = hlist_entry(p, struct net_device, name_hlist); 3928 if (!strncmp(d->name, dev->name, IFNAMSIZ)) { 3929 ret = -EEXIST; 3930 goto err_uninit; 3931 } 3932 } 3933 3934 /* Fix illegal checksum combinations */ 3935 if ((dev->features & NETIF_F_HW_CSUM) && 3936 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 3937 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 3938 dev->name); 3939 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 3940 } 3941 3942 if ((dev->features & NETIF_F_NO_CSUM) && 3943 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 3944 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 3945 dev->name); 3946 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 3947 } 3948 3949 3950 /* Fix illegal SG+CSUM combinations. 
*/ 3951 if ((dev->features & NETIF_F_SG) && 3952 !(dev->features & NETIF_F_ALL_CSUM)) { 3953 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n", 3954 dev->name); 3955 dev->features &= ~NETIF_F_SG; 3956 } 3957 3958 /* TSO requires that SG is present as well. */ 3959 if ((dev->features & NETIF_F_TSO) && 3960 !(dev->features & NETIF_F_SG)) { 3961 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n", 3962 dev->name); 3963 dev->features &= ~NETIF_F_TSO; 3964 } 3965 if (dev->features & NETIF_F_UFO) { 3966 if (!(dev->features & NETIF_F_HW_CSUM)) { 3967 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " 3968 "NETIF_F_HW_CSUM feature.\n", 3969 dev->name); 3970 dev->features &= ~NETIF_F_UFO; 3971 } 3972 if (!(dev->features & NETIF_F_SG)) { 3973 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " 3974 "NETIF_F_SG feature.\n", 3975 dev->name); 3976 dev->features &= ~NETIF_F_UFO; 3977 } 3978 } 3979 3980 netdev_initialize_kobject(dev); 3981 ret = netdev_register_kobject(dev); 3982 if (ret) 3983 goto err_uninit; 3984 dev->reg_state = NETREG_REGISTERED; 3985 3986 /* 3987 * Default initial state at registry is that the 3988 * device is present. 3989 */ 3990 3991 set_bit(__LINK_STATE_PRESENT, &dev->state); 3992 3993 dev_init_scheduler(dev); 3994 dev_hold(dev); 3995 list_netdevice(dev); 3996 3997 /* Notify protocols, that a new device appeared. */ 3998 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 3999 ret = notifier_to_errno(ret); 4000 if (ret) { 4001 rollback_registered(dev); 4002 dev->reg_state = NETREG_UNREGISTERED; 4003 } 4004 4005 out: 4006 return ret; 4007 4008 err_uninit: 4009 if (dev->uninit) 4010 dev->uninit(dev); 4011 goto out; 4012 } 4013 4014 /** 4015 * register_netdev - register a network device 4016 * @dev: device to register 4017 * 4018 * Take a completed network device structure and add it to the kernel 4019 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4020 * chain. 0 is returned on success. 
A negative errno code is returned 4021 * on a failure to set up the device, or if the name is a duplicate. 4022 * 4023 * This is a wrapper around register_netdevice that takes the rtnl semaphore 4024 * and expands the device name if you passed a format string to 4025 * alloc_netdev. 4026 */ 4027 int register_netdev(struct net_device *dev) 4028 { 4029 int err; 4030 4031 rtnl_lock(); 4032 4033 /* 4034 * If the name is a format string the caller wants us to do a 4035 * name allocation. 4036 */ 4037 if (strchr(dev->name, '%')) { 4038 err = dev_alloc_name(dev, dev->name); 4039 if (err < 0) 4040 goto out; 4041 } 4042 4043 err = register_netdevice(dev); 4044 out: 4045 rtnl_unlock(); 4046 return err; 4047 } 4048 EXPORT_SYMBOL(register_netdev); 4049 4050 /* 4051 * netdev_wait_allrefs - wait until all references are gone. 4052 * 4053 * This is called when unregistering network devices. 4054 * 4055 * Any protocol or device that holds a reference should register 4056 * for netdevice notification, and cleanup and put back the 4057 * reference if they receive an UNREGISTER event. 4058 * We can get stuck here if buggy protocols don't correctly 4059 * call dev_put. 4060 */ 4061 static void netdev_wait_allrefs(struct net_device *dev) 4062 { 4063 unsigned long rebroadcast_time, warning_time; 4064 4065 rebroadcast_time = warning_time = jiffies; 4066 while (atomic_read(&dev->refcnt) != 0) { 4067 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 4068 rtnl_lock(); 4069 4070 /* Rebroadcast unregister notification */ 4071 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4072 4073 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 4074 &dev->state)) { 4075 /* We must not have linkwatch events 4076 * pending on unregister. If this 4077 * happens, we simply run the queue 4078 * unscheduled, resulting in a noop 4079 * for this device. 
4080 */ 4081 linkwatch_run_queue(); 4082 } 4083 4084 __rtnl_unlock(); 4085 4086 rebroadcast_time = jiffies; 4087 } 4088 4089 msleep(250); 4090 4091 if (time_after(jiffies, warning_time + 10 * HZ)) { 4092 printk(KERN_EMERG "unregister_netdevice: " 4093 "waiting for %s to become free. Usage " 4094 "count = %d\n", 4095 dev->name, atomic_read(&dev->refcnt)); 4096 warning_time = jiffies; 4097 } 4098 } 4099 } 4100 4101 /* The sequence is: 4102 * 4103 * rtnl_lock(); 4104 * ... 4105 * register_netdevice(x1); 4106 * register_netdevice(x2); 4107 * ... 4108 * unregister_netdevice(y1); 4109 * unregister_netdevice(y2); 4110 * ... 4111 * rtnl_unlock(); 4112 * free_netdev(y1); 4113 * free_netdev(y2); 4114 * 4115 * We are invoked by rtnl_unlock() after it drops the semaphore. 4116 * This allows us to deal with problems: 4117 * 1) We can delete sysfs objects which invoke hotplug 4118 * without deadlocking with linkwatch via keventd. 4119 * 2) Since we run with the RTNL semaphore not held, we can sleep 4120 * safely in order to wait for the netdev refcnt to drop to zero. 4121 */ 4122 static DEFINE_MUTEX(net_todo_run_mutex); 4123 void netdev_run_todo(void) 4124 { 4125 struct list_head list; 4126 4127 /* Need to guard against multiple cpu's getting out of order. */ 4128 mutex_lock(&net_todo_run_mutex); 4129 4130 /* Not safe to do outside the semaphore. We must not return 4131 * until all unregister events invoked by the local processor 4132 * have been completed (either by this todo run, or one on 4133 * another cpu). 
4134 */ 4135 if (list_empty(&net_todo_list)) 4136 goto out; 4137 4138 /* Snapshot list, allow later requests */ 4139 spin_lock(&net_todo_list_lock); 4140 list_replace_init(&net_todo_list, &list); 4141 spin_unlock(&net_todo_list_lock); 4142 4143 while (!list_empty(&list)) { 4144 struct net_device *dev 4145 = list_entry(list.next, struct net_device, todo_list); 4146 list_del(&dev->todo_list); 4147 4148 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 4149 printk(KERN_ERR "network todo '%s' but state %d\n", 4150 dev->name, dev->reg_state); 4151 dump_stack(); 4152 continue; 4153 } 4154 4155 dev->reg_state = NETREG_UNREGISTERED; 4156 4157 netdev_wait_allrefs(dev); 4158 4159 /* paranoia */ 4160 BUG_ON(atomic_read(&dev->refcnt)); 4161 BUG_TRAP(!dev->ip_ptr); 4162 BUG_TRAP(!dev->ip6_ptr); 4163 BUG_TRAP(!dev->dn_ptr); 4164 4165 if (dev->destructor) 4166 dev->destructor(dev); 4167 4168 /* Free network device */ 4169 kobject_put(&dev->dev.kobj); 4170 } 4171 4172 out: 4173 mutex_unlock(&net_todo_run_mutex); 4174 } 4175 4176 static struct net_device_stats *internal_stats(struct net_device *dev) 4177 { 4178 return &dev->stats; 4179 } 4180 4181 static void netdev_init_one_queue(struct net_device *dev, 4182 struct netdev_queue *queue, 4183 void *_unused) 4184 { 4185 queue->dev = dev; 4186 } 4187 4188 static void netdev_init_queues(struct net_device *dev) 4189 { 4190 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 4191 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 4192 } 4193 4194 /** 4195 * alloc_netdev_mq - allocate network device 4196 * @sizeof_priv: size of private data to allocate space for 4197 * @name: device name format string 4198 * @setup: callback to initialize device 4199 * @queue_count: the number of subqueues to allocate 4200 * 4201 * Allocates a struct net_device with private data area for driver use 4202 * and performs basic initialization. Also allocates subquue structs 4203 * for each queue on the device at the end of the netdevice. 
4204 */ 4205 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 4206 void (*setup)(struct net_device *), unsigned int queue_count) 4207 { 4208 struct netdev_queue *tx; 4209 struct net_device *dev; 4210 int alloc_size; 4211 void *p; 4212 4213 BUG_ON(strlen(name) >= sizeof(dev->name)); 4214 4215 alloc_size = sizeof(struct net_device); 4216 if (sizeof_priv) { 4217 /* ensure 32-byte alignment of private area */ 4218 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; 4219 alloc_size += sizeof_priv; 4220 } 4221 /* ensure 32-byte alignment of whole construct */ 4222 alloc_size += NETDEV_ALIGN_CONST; 4223 4224 p = kzalloc(alloc_size, GFP_KERNEL); 4225 if (!p) { 4226 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); 4227 return NULL; 4228 } 4229 4230 tx = kzalloc(sizeof(struct netdev_queue) * queue_count, GFP_KERNEL); 4231 if (!tx) { 4232 printk(KERN_ERR "alloc_netdev: Unable to allocate " 4233 "tx qdiscs.\n"); 4234 kfree(p); 4235 return NULL; 4236 } 4237 4238 dev = (struct net_device *) 4239 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); 4240 dev->padded = (char *)dev - (char *)p; 4241 dev_net_set(dev, &init_net); 4242 4243 dev->_tx = tx; 4244 dev->num_tx_queues = queue_count; 4245 dev->real_num_tx_queues = queue_count; 4246 4247 if (sizeof_priv) { 4248 dev->priv = ((char *)dev + 4249 ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) 4250 & ~NETDEV_ALIGN_CONST)); 4251 } 4252 4253 dev->gso_max_size = GSO_MAX_SIZE; 4254 4255 netdev_init_queues(dev); 4256 4257 dev->get_stats = internal_stats; 4258 netpoll_netdev_init(dev); 4259 setup(dev); 4260 strcpy(dev->name, name); 4261 return dev; 4262 } 4263 EXPORT_SYMBOL(alloc_netdev_mq); 4264 4265 /** 4266 * free_netdev - free network device 4267 * @dev: device 4268 * 4269 * This function does the last stage of destroying an allocated device 4270 * interface. The reference to the device object is released. 4271 * If this is the last reference then it will be freed. 
4272 */ 4273 void free_netdev(struct net_device *dev) 4274 { 4275 release_net(dev_net(dev)); 4276 4277 kfree(dev->_tx); 4278 4279 /* Compatibility with error handling in drivers */ 4280 if (dev->reg_state == NETREG_UNINITIALIZED) { 4281 kfree((char *)dev - dev->padded); 4282 return; 4283 } 4284 4285 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 4286 dev->reg_state = NETREG_RELEASED; 4287 4288 /* will free via device release */ 4289 put_device(&dev->dev); 4290 } 4291 4292 /* Synchronize with packet receive processing. */ 4293 void synchronize_net(void) 4294 { 4295 might_sleep(); 4296 synchronize_rcu(); 4297 } 4298 4299 /** 4300 * unregister_netdevice - remove device from the kernel 4301 * @dev: device 4302 * 4303 * This function shuts down a device interface and removes it 4304 * from the kernel tables. 4305 * 4306 * Callers must hold the rtnl semaphore. You may want 4307 * unregister_netdev() instead of this. 4308 */ 4309 4310 void unregister_netdevice(struct net_device *dev) 4311 { 4312 ASSERT_RTNL(); 4313 4314 rollback_registered(dev); 4315 /* Finish processing unregister after unlock */ 4316 net_set_todo(dev); 4317 } 4318 4319 /** 4320 * unregister_netdev - remove device from the kernel 4321 * @dev: device 4322 * 4323 * This function shuts down a device interface and removes it 4324 * from the kernel tables. 4325 * 4326 * This is just a wrapper for unregister_netdevice that takes 4327 * the rtnl semaphore. In general you want to use this and not 4328 * unregister_netdevice. 4329 */ 4330 void unregister_netdev(struct net_device *dev) 4331 { 4332 rtnl_lock(); 4333 unregister_netdevice(dev); 4334 rtnl_unlock(); 4335 } 4336 4337 EXPORT_SYMBOL(unregister_netdev); 4338 4339 /** 4340 * dev_change_net_namespace - move device to different nethost namespace 4341 * @dev: device 4342 * @net: network namespace 4343 * @pat: If not NULL name pattern to try if the current device name 4344 * is already taken in the destination network namespace. 
4345 * 4346 * This function shuts down a device interface and moves it 4347 * to a new network namespace. On success 0 is returned, on 4348 * a failure a netagive errno code is returned. 4349 * 4350 * Callers must hold the rtnl semaphore. 4351 */ 4352 4353 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 4354 { 4355 char buf[IFNAMSIZ]; 4356 const char *destname; 4357 int err; 4358 4359 ASSERT_RTNL(); 4360 4361 /* Don't allow namespace local devices to be moved. */ 4362 err = -EINVAL; 4363 if (dev->features & NETIF_F_NETNS_LOCAL) 4364 goto out; 4365 4366 /* Ensure the device has been registrered */ 4367 err = -EINVAL; 4368 if (dev->reg_state != NETREG_REGISTERED) 4369 goto out; 4370 4371 /* Get out if there is nothing todo */ 4372 err = 0; 4373 if (net_eq(dev_net(dev), net)) 4374 goto out; 4375 4376 /* Pick the destination device name, and ensure 4377 * we can use it in the destination network namespace. 4378 */ 4379 err = -EEXIST; 4380 destname = dev->name; 4381 if (__dev_get_by_name(net, destname)) { 4382 /* We get here if we can't use the current device name */ 4383 if (!pat) 4384 goto out; 4385 if (!dev_valid_name(pat)) 4386 goto out; 4387 if (strchr(pat, '%')) { 4388 if (__dev_alloc_name(net, pat, buf) < 0) 4389 goto out; 4390 destname = buf; 4391 } else 4392 destname = pat; 4393 if (__dev_get_by_name(net, destname)) 4394 goto out; 4395 } 4396 4397 /* 4398 * And now a mini version of register_netdevice unregister_netdevice. 4399 */ 4400 4401 /* If device is running close it first. */ 4402 dev_close(dev); 4403 4404 /* And unlink it from device chain */ 4405 err = -ENODEV; 4406 unlist_netdevice(dev); 4407 4408 synchronize_net(); 4409 4410 /* Shutdown queueing discipline. */ 4411 dev_shutdown(dev); 4412 4413 /* Notify protocols, that we are about to destroy 4414 this device. They should clean all the things. 
4415 */ 4416 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4417 4418 /* 4419 * Flush the unicast and multicast chains 4420 */ 4421 dev_addr_discard(dev); 4422 4423 /* Actually switch the network namespace */ 4424 dev_net_set(dev, net); 4425 4426 /* Assign the new device name */ 4427 if (destname != dev->name) 4428 strcpy(dev->name, destname); 4429 4430 /* If there is an ifindex conflict assign a new one */ 4431 if (__dev_get_by_index(net, dev->ifindex)) { 4432 int iflink = (dev->iflink == dev->ifindex); 4433 dev->ifindex = dev_new_index(net); 4434 if (iflink) 4435 dev->iflink = dev->ifindex; 4436 } 4437 4438 /* Fixup kobjects */ 4439 netdev_unregister_kobject(dev); 4440 err = netdev_register_kobject(dev); 4441 WARN_ON(err); 4442 4443 /* Add the device back in the hashes */ 4444 list_netdevice(dev); 4445 4446 /* Notify protocols, that a new device appeared. */ 4447 call_netdevice_notifiers(NETDEV_REGISTER, dev); 4448 4449 synchronize_net(); 4450 err = 0; 4451 out: 4452 return err; 4453 } 4454 4455 static int dev_cpu_callback(struct notifier_block *nfb, 4456 unsigned long action, 4457 void *ocpu) 4458 { 4459 struct sk_buff **list_skb; 4460 struct Qdisc **list_net; 4461 struct sk_buff *skb; 4462 unsigned int cpu, oldcpu = (unsigned long)ocpu; 4463 struct softnet_data *sd, *oldsd; 4464 4465 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 4466 return NOTIFY_OK; 4467 4468 local_irq_disable(); 4469 cpu = smp_processor_id(); 4470 sd = &per_cpu(softnet_data, cpu); 4471 oldsd = &per_cpu(softnet_data, oldcpu); 4472 4473 /* Find end of our completion_queue. */ 4474 list_skb = &sd->completion_queue; 4475 while (*list_skb) 4476 list_skb = &(*list_skb)->next; 4477 /* Append completion queue from offline CPU. */ 4478 *list_skb = oldsd->completion_queue; 4479 oldsd->completion_queue = NULL; 4480 4481 /* Find end of our output_queue. 
*/ 4482 list_net = &sd->output_queue; 4483 while (*list_net) 4484 list_net = &(*list_net)->next_sched; 4485 /* Append output queue from offline CPU. */ 4486 *list_net = oldsd->output_queue; 4487 oldsd->output_queue = NULL; 4488 4489 raise_softirq_irqoff(NET_TX_SOFTIRQ); 4490 local_irq_enable(); 4491 4492 /* Process offline CPU's input_pkt_queue */ 4493 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) 4494 netif_rx(skb); 4495 4496 return NOTIFY_OK; 4497 } 4498 4499 #ifdef CONFIG_NET_DMA 4500 /** 4501 * net_dma_rebalance - try to maintain one DMA channel per CPU 4502 * @net_dma: DMA client and associated data (lock, channels, channel_mask) 4503 * 4504 * This is called when the number of channels allocated to the net_dma client 4505 * changes. The net_dma client tries to have one DMA channel per CPU. 4506 */ 4507 4508 static void net_dma_rebalance(struct net_dma *net_dma) 4509 { 4510 unsigned int cpu, i, n, chan_idx; 4511 struct dma_chan *chan; 4512 4513 if (cpus_empty(net_dma->channel_mask)) { 4514 for_each_online_cpu(cpu) 4515 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); 4516 return; 4517 } 4518 4519 i = 0; 4520 cpu = first_cpu(cpu_online_map); 4521 4522 for_each_cpu_mask(chan_idx, net_dma->channel_mask) { 4523 chan = net_dma->channels[chan_idx]; 4524 4525 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask)) 4526 + (i < (num_online_cpus() % 4527 cpus_weight(net_dma->channel_mask)) ? 
1 : 0)); 4528 4529 while(n) { 4530 per_cpu(softnet_data, cpu).net_dma = chan; 4531 cpu = next_cpu(cpu, cpu_online_map); 4532 n--; 4533 } 4534 i++; 4535 } 4536 } 4537 4538 /** 4539 * netdev_dma_event - event callback for the net_dma_client 4540 * @client: should always be net_dma_client 4541 * @chan: DMA channel for the event 4542 * @state: DMA state to be handled 4543 */ 4544 static enum dma_state_client 4545 netdev_dma_event(struct dma_client *client, struct dma_chan *chan, 4546 enum dma_state state) 4547 { 4548 int i, found = 0, pos = -1; 4549 struct net_dma *net_dma = 4550 container_of(client, struct net_dma, client); 4551 enum dma_state_client ack = DMA_DUP; /* default: take no action */ 4552 4553 spin_lock(&net_dma->lock); 4554 switch (state) { 4555 case DMA_RESOURCE_AVAILABLE: 4556 for (i = 0; i < nr_cpu_ids; i++) 4557 if (net_dma->channels[i] == chan) { 4558 found = 1; 4559 break; 4560 } else if (net_dma->channels[i] == NULL && pos < 0) 4561 pos = i; 4562 4563 if (!found && pos >= 0) { 4564 ack = DMA_ACK; 4565 net_dma->channels[pos] = chan; 4566 cpu_set(pos, net_dma->channel_mask); 4567 net_dma_rebalance(net_dma); 4568 } 4569 break; 4570 case DMA_RESOURCE_REMOVED: 4571 for (i = 0; i < nr_cpu_ids; i++) 4572 if (net_dma->channels[i] == chan) { 4573 found = 1; 4574 pos = i; 4575 break; 4576 } 4577 4578 if (found) { 4579 ack = DMA_ACK; 4580 cpu_clear(pos, net_dma->channel_mask); 4581 net_dma->channels[i] = NULL; 4582 net_dma_rebalance(net_dma); 4583 } 4584 break; 4585 default: 4586 break; 4587 } 4588 spin_unlock(&net_dma->lock); 4589 4590 return ack; 4591 } 4592 4593 /** 4594 * netdev_dma_regiser - register the networking subsystem as a DMA client 4595 */ 4596 static int __init netdev_dma_register(void) 4597 { 4598 net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma), 4599 GFP_KERNEL); 4600 if (unlikely(!net_dma.channels)) { 4601 printk(KERN_NOTICE 4602 "netdev_dma: no memory for net_dma.channels\n"); 4603 return -ENOMEM; 4604 } 4605 
spin_lock_init(&net_dma.lock); 4606 dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask); 4607 dma_async_client_register(&net_dma.client); 4608 dma_async_client_chan_request(&net_dma.client); 4609 return 0; 4610 } 4611 4612 #else 4613 static int __init netdev_dma_register(void) { return -ENODEV; } 4614 #endif /* CONFIG_NET_DMA */ 4615 4616 /** 4617 * netdev_compute_feature - compute conjunction of two feature sets 4618 * @all: first feature set 4619 * @one: second feature set 4620 * 4621 * Computes a new feature set after adding a device with feature set 4622 * @one to the master device with current feature set @all. Returns 4623 * the new feature set. 4624 */ 4625 int netdev_compute_features(unsigned long all, unsigned long one) 4626 { 4627 /* if device needs checksumming, downgrade to hw checksumming */ 4628 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 4629 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; 4630 4631 /* if device can't do all checksum, downgrade to ipv4/ipv6 */ 4632 if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM)) 4633 all ^= NETIF_F_HW_CSUM 4634 | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; 4635 4636 if (one & NETIF_F_GSO) 4637 one |= NETIF_F_GSO_SOFTWARE; 4638 one |= NETIF_F_GSO; 4639 4640 /* If even one device supports robust GSO, enable it for all. 
*/ 4641 if (one & NETIF_F_GSO_ROBUST) 4642 all |= NETIF_F_GSO_ROBUST; 4643 4644 all &= one | NETIF_F_LLTX; 4645 4646 if (!(all & NETIF_F_ALL_CSUM)) 4647 all &= ~NETIF_F_SG; 4648 if (!(all & NETIF_F_SG)) 4649 all &= ~NETIF_F_GSO_MASK; 4650 4651 return all; 4652 } 4653 EXPORT_SYMBOL(netdev_compute_features); 4654 4655 static struct hlist_head *netdev_create_hash(void) 4656 { 4657 int i; 4658 struct hlist_head *hash; 4659 4660 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 4661 if (hash != NULL) 4662 for (i = 0; i < NETDEV_HASHENTRIES; i++) 4663 INIT_HLIST_HEAD(&hash[i]); 4664 4665 return hash; 4666 } 4667 4668 /* Initialize per network namespace state */ 4669 static int __net_init netdev_init(struct net *net) 4670 { 4671 INIT_LIST_HEAD(&net->dev_base_head); 4672 4673 net->dev_name_head = netdev_create_hash(); 4674 if (net->dev_name_head == NULL) 4675 goto err_name; 4676 4677 net->dev_index_head = netdev_create_hash(); 4678 if (net->dev_index_head == NULL) 4679 goto err_idx; 4680 4681 return 0; 4682 4683 err_idx: 4684 kfree(net->dev_name_head); 4685 err_name: 4686 return -ENOMEM; 4687 } 4688 4689 static void __net_exit netdev_exit(struct net *net) 4690 { 4691 kfree(net->dev_name_head); 4692 kfree(net->dev_index_head); 4693 } 4694 4695 static struct pernet_operations __net_initdata netdev_net_ops = { 4696 .init = netdev_init, 4697 .exit = netdev_exit, 4698 }; 4699 4700 static void __net_exit default_device_exit(struct net *net) 4701 { 4702 struct net_device *dev, *next; 4703 /* 4704 * Push all migratable of the network devices back to the 4705 * initial network namespace 4706 */ 4707 rtnl_lock(); 4708 for_each_netdev_safe(net, dev, next) { 4709 int err; 4710 char fb_name[IFNAMSIZ]; 4711 4712 /* Ignore unmoveable devices (i.e. 
loopback) */ 4713 if (dev->features & NETIF_F_NETNS_LOCAL) 4714 continue; 4715 4716 /* Push remaing network devices to init_net */ 4717 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 4718 err = dev_change_net_namespace(dev, &init_net, fb_name); 4719 if (err) { 4720 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", 4721 __func__, dev->name, err); 4722 BUG(); 4723 } 4724 } 4725 rtnl_unlock(); 4726 } 4727 4728 static struct pernet_operations __net_initdata default_device_ops = { 4729 .exit = default_device_exit, 4730 }; 4731 4732 /* 4733 * Initialize the DEV module. At boot time this walks the device list and 4734 * unhooks any devices that fail to initialise (normally hardware not 4735 * present) and leaves us with a valid list of present and active devices. 4736 * 4737 */ 4738 4739 /* 4740 * This is called single threaded during boot, so no need 4741 * to take the rtnl semaphore. 4742 */ 4743 static int __init net_dev_init(void) 4744 { 4745 int i, rc = -ENOMEM; 4746 4747 BUG_ON(!dev_boot_phase); 4748 4749 if (dev_proc_init()) 4750 goto out; 4751 4752 if (netdev_kobject_init()) 4753 goto out; 4754 4755 INIT_LIST_HEAD(&ptype_all); 4756 for (i = 0; i < PTYPE_HASH_SIZE; i++) 4757 INIT_LIST_HEAD(&ptype_base[i]); 4758 4759 if (register_pernet_subsys(&netdev_net_ops)) 4760 goto out; 4761 4762 if (register_pernet_device(&default_device_ops)) 4763 goto out; 4764 4765 /* 4766 * Initialise the packet receive queues. 
4767 */ 4768 4769 for_each_possible_cpu(i) { 4770 struct softnet_data *queue; 4771 4772 queue = &per_cpu(softnet_data, i); 4773 skb_queue_head_init(&queue->input_pkt_queue); 4774 queue->completion_queue = NULL; 4775 INIT_LIST_HEAD(&queue->poll_list); 4776 4777 queue->backlog.poll = process_backlog; 4778 queue->backlog.weight = weight_p; 4779 } 4780 4781 netdev_dma_register(); 4782 4783 dev_boot_phase = 0; 4784 4785 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 4786 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 4787 4788 hotcpu_notifier(dev_cpu_callback, 0); 4789 dst_init(); 4790 dev_mcast_init(); 4791 rc = 0; 4792 out: 4793 return rc; 4794 } 4795 4796 subsys_initcall(net_dev_init); 4797 4798 EXPORT_SYMBOL(__dev_get_by_index); 4799 EXPORT_SYMBOL(__dev_get_by_name); 4800 EXPORT_SYMBOL(__dev_remove_pack); 4801 EXPORT_SYMBOL(dev_valid_name); 4802 EXPORT_SYMBOL(dev_add_pack); 4803 EXPORT_SYMBOL(dev_alloc_name); 4804 EXPORT_SYMBOL(dev_close); 4805 EXPORT_SYMBOL(dev_get_by_flags); 4806 EXPORT_SYMBOL(dev_get_by_index); 4807 EXPORT_SYMBOL(dev_get_by_name); 4808 EXPORT_SYMBOL(dev_open); 4809 EXPORT_SYMBOL(dev_queue_xmit); 4810 EXPORT_SYMBOL(dev_remove_pack); 4811 EXPORT_SYMBOL(dev_set_allmulti); 4812 EXPORT_SYMBOL(dev_set_promiscuity); 4813 EXPORT_SYMBOL(dev_change_flags); 4814 EXPORT_SYMBOL(dev_set_mtu); 4815 EXPORT_SYMBOL(dev_set_mac_address); 4816 EXPORT_SYMBOL(free_netdev); 4817 EXPORT_SYMBOL(netdev_boot_setup_check); 4818 EXPORT_SYMBOL(netdev_set_master); 4819 EXPORT_SYMBOL(netdev_state_change); 4820 EXPORT_SYMBOL(netif_receive_skb); 4821 EXPORT_SYMBOL(netif_rx); 4822 EXPORT_SYMBOL(register_gifconf); 4823 EXPORT_SYMBOL(register_netdevice); 4824 EXPORT_SYMBOL(register_netdevice_notifier); 4825 EXPORT_SYMBOL(skb_checksum_help); 4826 EXPORT_SYMBOL(synchronize_net); 4827 EXPORT_SYMBOL(unregister_netdevice); 4828 EXPORT_SYMBOL(unregister_netdevice_notifier); 4829 EXPORT_SYMBOL(net_enable_timestamp); 4830 EXPORT_SYMBOL(net_disable_timestamp); 4831 EXPORT_SYMBOL(dev_get_flags); 
/* Bridge hooks: exported when CONFIG_BRIDGE or CONFIG_BRIDGE_MODULE is defined. */
#if defined(CONFIG_BRIDGE_MODULE) || defined(CONFIG_BRIDGE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif /* CONFIG_BRIDGE || CONFIG_BRIDGE_MODULE */

/* dev_load is exported only when CONFIG_KMOD is defined. */
#if defined(CONFIG_KMOD)
EXPORT_SYMBOL(dev_load);
#endif /* CONFIG_KMOD */

EXPORT_PER_CPU_SYMBOL(softnet_data);