xref: /linux/net/core/dev.c (revision 6a3335b43342b42dd6c69b4bbbde15d622cb49ca)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16. Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
168 
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * See, for example usages, register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
195 
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
/* Device list insertion: link @dev into its namespace's device list and
 * into both the name and ifindex hash tables.  Caller must hold the RTNL
 * semaphore; dev_base_lock is taken here to exclude pure readers while
 * the three structures are updated together. */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail(&dev->dev_list, &net->dev_base_head);
	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	/* Always succeeds; int return kept for the caller's convention. */
	return 0;
}
224 
/* Device list removal: unhook @dev from the namespace device list and
 * from both hash tables, as one atomic update under dev_base_lock.
 * Caller must hold the RTNL semaphore. */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del(&dev->dev_list);
	hlist_del(&dev->name_hlist);
	hlist_del(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
/* Lockdep disabled: xmit-lock class annotation is a no-op. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* Lockdep disabled: addr_list_lock class annotation is a no-op. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers, mangling input packets,
345  *	MUST BE last in hash buckets and checking protocol handlers
346  *	MUST start from promiscuous ptype_all chain in net_bh.
347  *	It is true now, do not change it.
348  *	Explanation follows: if protocol handler, mangling packet, will
349  *	be the first on list, it is not able to sense, that packet
350  *	is cloned and should be copied-on-write, so that it will
351  *	change it and subsequent readers will get broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep therefore it can not
364  *	guarantee all CPU's that are in middle of receiving packets
365  *	will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
381 
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPU's have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* Wait for all in-flight receive paths (RCU readers that may still
	 * see @pt on a list) to finish before the caller frees/reuses it. */
	synchronize_net();
}
436 
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine to
453  *	all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot time settings for the device.
478  *	The found settings are set for the device to be used
479  *	later in the device probing.
480  *	Returns 0 if no settings found, 1 if they are.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot time settings for the base address of device.
507  *	The found settings are set for the device to be used
508  *	later in the device probing.
509  *	Returns 0 if no settings found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If device already registered then return base of 1
521 	 * to indicate not to probe for this interface
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves at boot time configured settings for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
560 
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
615 
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns %NULL if the device
622  *	is not found or a pointer to the device. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns NULL if the device
648  *	is not found or a pointer to the device. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns NULL if the device
672  *	is not found or a pointer to the device. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count increased
674  *	and the caller must therefore be careful about locking
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *
730  *	Search for any interface with the given flags. Returns NULL if a device
731  *	is not found or a pointer to the device. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names to
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
777 
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" it will try and find a suitable
785  *	id. It scans list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const int max_netdevices = 8*PAGE_SIZE;	/* one page used as a bitmap below */
	const char *p;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			/* Run the format string "backwards": parse the unit
			 * number out of each existing device name. */
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest clear bit = lowest free unit number. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Build the candidate (literal name, or format + chosen unit) and
	 * make sure it is not already taken in this namespace. */
	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" it will try and find a suitable
849  *	id. It scans list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
870 
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *
877  *	Change name of a device, can pass format strings "eth%d".
878  *	for wildcarding.
879  */
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	}
907 	else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from info
952  *
953  *	Set ifalias for a device,
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
void netdev_features_change(struct net_device *dev)
{
	/* Notify interested parties that dev->features has changed. */
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
/* Raise %NETDEV_BONDING_FAILOVER on the netdev notifier chain for @dev. */
void netdev_bonding_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Enable NET_DMA
1092 		 */
1093 		net_dmaengine_get();
1094 
1095 		/*
1096 		 *	Initialize multicasting status
1097 		 */
1098 		dev_set_rx_mode(dev);
1099 
1100 		/*
1101 		 *	Wakeup transmit queue engine
1102 		 */
1103 		dev_activate(dev);
1104 
1105 		/*
1106 		 *	... and announce new interface.
1107 		 */
1108 		call_netdevice_notifiers(NETDEV_UP, dev);
1109 	}
1110 
1111 	return ret;
1112 }
1113 
1114 /**
1115  *	dev_close - shutdown an interface.
1116  *	@dev: device to shutdown
1117  *
1118  *	This function moves an active device into down state. A
1119  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1120  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1121  *	chain.
1122  */
1123 int dev_close(struct net_device *dev)
1124 {
1125 	const struct net_device_ops *ops = dev->netdev_ops;
1126 	ASSERT_RTNL();
1127 
1128 	might_sleep();
1129 
1130 	if (!(dev->flags & IFF_UP))
1131 		return 0;
1132 
1133 	/*
1134 	 *	Tell people we are going down, so that they can
1135 	 *	prepare to death, when device is still operating.
1136 	 */
1137 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1138 
1139 	clear_bit(__LINK_STATE_START, &dev->state);
1140 
1141 	/* Synchronize to scheduled poll. We cannot touch poll list,
1142 	 * it can be even on different cpu. So just clear netif_running().
1143 	 *
1144 	 * dev->stop() will invoke napi_disable() on all of it's
1145 	 * napi_struct instances on this device.
1146 	 */
1147 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1148 
1149 	dev_deactivate(dev);
1150 
1151 	/*
1152 	 *	Call the device specific close. This cannot fail.
1153 	 *	Only if device is UP
1154 	 *
1155 	 *	We allow it to be called even after a DETACH hot-plug
1156 	 *	event.
1157 	 */
1158 	if (ops->ndo_stop)
1159 		ops->ndo_stop(dev);
1160 
1161 	/*
1162 	 *	Device is now down.
1163 	 */
1164 
1165 	dev->flags &= ~IFF_UP;
1166 
1167 	/*
1168 	 * Tell people we are down
1169 	 */
1170 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1171 
1172 	/*
1173 	 *	Shutdown NET_DMA
1174 	 */
1175 	net_dmaengine_put();
1176 
1177 	return 0;
1178 }
1179 
1180 
1181 /**
1182  *	dev_disable_lro - disable Large Receive Offload on a device
1183  *	@dev: device
1184  *
1185  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186  *	called under RTNL.  This is needed if received packets may be
1187  *	forwarded to another interface.
1188  */
1189 void dev_disable_lro(struct net_device *dev)
1190 {
1191 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192 	    dev->ethtool_ops->set_flags) {
1193 		u32 flags = dev->ethtool_ops->get_flags(dev);
1194 		if (flags & ETH_FLAG_LRO) {
1195 			flags &= ~ETH_FLAG_LRO;
1196 			dev->ethtool_ops->set_flags(dev, flags);
1197 		}
1198 	}
1199 	WARN_ON(dev->features & NETIF_F_LRO);
1200 }
1201 EXPORT_SYMBOL(dev_disable_lro);
1202 
1203 
/* Non-zero during early boot: register_netdevice_notifier() skips the
 * replay of REGISTER/UP events while this is set.  Presumably cleared
 * once core network init has run — the clearing site is not visible in
 * this section. */
static int dev_boot_phase = 1;
1205 
1206 /*
1207  *	Device change register/unregister. These are not inline or static
1208  *	as we export them to the world.
1209  */
1210 
1211 /**
1212  *	register_netdevice_notifier - register a network notifier block
1213  *	@nb: notifier
1214  *
1215  *	Register a notifier to be called when network device events occur.
1216  *	The notifier passed is linked into the kernel structures and must
1217  *	not be reused until it has been unregistered. A negative errno code
1218  *	is returned on a failure.
1219  *
1220  * 	When registered all registration and up events are replayed
1221  *	to the new notifier to allow device to have a race free
1222  *	view of the network device list.
1223  */
1224 
1225 int register_netdevice_notifier(struct notifier_block *nb)
1226 {
1227 	struct net_device *dev;
1228 	struct net_device *last;
1229 	struct net *net;
1230 	int err;
1231 
1232 	rtnl_lock();
1233 	err = raw_notifier_chain_register(&netdev_chain, nb);
1234 	if (err)
1235 		goto unlock;
1236 	if (dev_boot_phase)
1237 		goto unlock;
1238 	for_each_net(net) {
1239 		for_each_netdev(net, dev) {
1240 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241 			err = notifier_to_errno(err);
1242 			if (err)
1243 				goto rollback;
1244 
1245 			if (!(dev->flags & IFF_UP))
1246 				continue;
1247 
1248 			nb->notifier_call(nb, NETDEV_UP, dev);
1249 		}
1250 	}
1251 
1252 unlock:
1253 	rtnl_unlock();
1254 	return err;
1255 
1256 rollback:
1257 	last = dev;
1258 	for_each_net(net) {
1259 		for_each_netdev(net, dev) {
1260 			if (dev == last)
1261 				break;
1262 
1263 			if (dev->flags & IFF_UP) {
1264 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266 			}
1267 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268 		}
1269 	}
1270 
1271 	raw_notifier_chain_unregister(&netdev_chain, nb);
1272 	goto unlock;
1273 }
1274 
1275 /**
1276  *	unregister_netdevice_notifier - unregister a network notifier block
1277  *	@nb: notifier
1278  *
1279  *	Unregister a notifier previously registered by
1280  *	register_netdevice_notifier(). The notifier is unlinked into the
1281  *	kernel structures and may then be reused. A negative errno code
1282  *	is returned on a failure.
1283  */
1284 
1285 int unregister_netdevice_notifier(struct notifier_block *nb)
1286 {
1287 	int err;
1288 
1289 	rtnl_lock();
1290 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1291 	rtnl_unlock();
1292 	return err;
1293 }
1294 
1295 /**
1296  *	call_netdevice_notifiers - call all network notifier blocks
1297  *      @val: value passed unmodified to notifier function
1298  *      @dev: net_device pointer passed unmodified to notifier function
1299  *
1300  *	Call all network notifier blocks.  Parameters and return value
1301  *	are as for raw_notifier_call_chain().
1302  */
1303 
1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1305 {
1306 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1307 }
1308 
/* When > 0 there are consumers of rx skb time stamps; checked in
 * net_timestamp() below to avoid the cost when nobody cares. */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311 
/* Declare interest in rx timestamps; counted, pairs with
 * net_disable_timestamp(). */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
1316 
/* Drop one reference taken by net_enable_timestamp(). */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
1321 
1322 static inline void net_timestamp(struct sk_buff *skb)
1323 {
1324 	if (atomic_read(&netstamp_needed))
1325 		__net_timestamp(skb);
1326 	else
1327 		skb->tstamp.tv64 = 0;
1328 }
1329 
1330 /*
1331  *	Support routine. Sends outgoing frames to any network
1332  *	taps currently in use.
1333  */
1334 
1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336 {
1337 	struct packet_type *ptype;
1338 
1339 	net_timestamp(skb);
1340 
1341 	rcu_read_lock();
1342 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343 		/* Never send packets back to the socket
1344 		 * they originated from - MvS (miquels@drinkel.ow.org)
1345 		 */
1346 		if ((ptype->dev == dev || !ptype->dev) &&
1347 		    (ptype->af_packet_priv == NULL ||
1348 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1350 			if (!skb2)
1351 				break;
1352 
1353 			/* skb->nh should be correctly
1354 			   set by sender, so that the second statement is
1355 			   just protection against buggy protocols.
1356 			 */
1357 			skb_reset_mac_header(skb2);
1358 
1359 			if (skb_network_header(skb2) < skb2->data ||
1360 			    skb2->network_header > skb2->tail) {
1361 				if (net_ratelimit())
1362 					printk(KERN_CRIT "protocol %04x is "
1363 					       "buggy, dev %s\n",
1364 					       skb2->protocol, dev->name);
1365 				skb_reset_network_header(skb2);
1366 			}
1367 
1368 			skb2->transport_header = skb2->network_header;
1369 			skb2->pkt_type = PACKET_OUTGOING;
1370 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371 		}
1372 	}
1373 	rcu_read_unlock();
1374 }
1375 
1376 
/* Chain @q onto this cpu's softnet output queue and raise
 * NET_TX_SOFTIRQ; irqs are disabled around the per-cpu list update. */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1389 
/* Schedule @q for tx processing unless it is already scheduled;
 * __QDISC_STATE_SCHED is the "already queued" latch. */
void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
1396 
/*
 * Free an skb from hard-irq context.  The final free is deferred: on
 * the last reference the skb is chained onto this cpu's completion
 * queue and actually destroyed later in net_tx_action().
 */
void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);
1412 
/* Free an skb from any context: defer via dev_kfree_skb_irq() when we
 * cannot free directly (hard irq, or irqs disabled). */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1421 
1422 
1423 /**
1424  * netif_device_detach - mark device as removed
1425  * @dev: network device
1426  *
1427  * Mark device as removed from system and therefore no longer available.
1428  */
1429 void netif_device_detach(struct net_device *dev)
1430 {
1431 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432 	    netif_running(dev)) {
1433 		netif_stop_queue(dev);
1434 	}
1435 }
1436 EXPORT_SYMBOL(netif_device_detach);
1437 
1438 /**
1439  * netif_device_attach - mark device as attached
1440  * @dev: network device
1441  *
1442  * Mark device as attached from system and restart if needed.
1443  */
1444 void netif_device_attach(struct net_device *dev)
1445 {
1446 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447 	    netif_running(dev)) {
1448 		netif_wake_queue(dev);
1449 		__netdev_watchdog_up(dev);
1450 	}
1451 }
1452 EXPORT_SYMBOL(netif_device_attach);
1453 
1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455 {
1456 	return ((features & NETIF_F_GEN_CSUM) ||
1457 		((features & NETIF_F_IP_CSUM) &&
1458 		 protocol == htons(ETH_P_IP)) ||
1459 		((features & NETIF_F_IPV6_CSUM) &&
1460 		 protocol == htons(ETH_P_IPV6)));
1461 }
1462 
1463 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1464 {
1465 	if (can_checksum_protocol(dev->features, skb->protocol))
1466 		return true;
1467 
1468 	if (skb->protocol == htons(ETH_P_8021Q)) {
1469 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1470 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1471 					  veh->h_vlan_encapsulated_proto))
1472 			return true;
1473 	}
1474 
1475 	return false;
1476 }
1477 
1478 /*
1479  * Invalidate hardware checksum when packet is to be mangled, and
1480  * complete checksum manually on outgoing path.
1481  */
1482 int skb_checksum_help(struct sk_buff *skb)
1483 {
1484 	__wsum csum;
1485 	int ret = 0, offset;
1486 
1487 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1488 		goto out_set_summed;
1489 
1490 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1491 		/* Let GSO fix up the checksum. */
1492 		goto out_set_summed;
1493 	}
1494 
1495 	offset = skb->csum_start - skb_headroom(skb);
1496 	BUG_ON(offset >= skb_headlen(skb));
1497 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1498 
1499 	offset += skb->csum_offset;
1500 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1501 
1502 	if (skb_cloned(skb) &&
1503 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1504 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1505 		if (ret)
1506 			goto out;
1507 	}
1508 
1509 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1510 out_set_summed:
1511 	skb->ip_summed = CHECKSUM_NONE;
1512 out:
1513 	return ret;
1514 }
1515 
1516 /**
1517  *	skb_gso_segment - Perform segmentation on skb.
1518  *	@skb: buffer to segment
1519  *	@features: features for the output path (see dev->features)
1520  *
1521  *	This function segments the given skb and returns a list of segments.
1522  *
1523  *	It may return NULL if the skb requires no segmentation.  This is
1524  *	only possible when GSO is used for verifying header integrity.
1525  */
1526 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1527 {
1528 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1529 	struct packet_type *ptype;
1530 	__be16 type = skb->protocol;
1531 	int err;
1532 
1533 	skb_reset_mac_header(skb);
1534 	skb->mac_len = skb->network_header - skb->mac_header;
1535 	__skb_pull(skb, skb->mac_len);
1536 
1537 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1538 		struct net_device *dev = skb->dev;
1539 		struct ethtool_drvinfo info = {};
1540 
1541 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1542 			dev->ethtool_ops->get_drvinfo(dev, &info);
1543 
1544 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1545 			"ip_summed=%d",
1546 		     info.driver, dev ? dev->features : 0L,
1547 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1548 		     skb->len, skb->data_len, skb->ip_summed);
1549 
1550 		if (skb_header_cloned(skb) &&
1551 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1552 			return ERR_PTR(err);
1553 	}
1554 
1555 	rcu_read_lock();
1556 	list_for_each_entry_rcu(ptype,
1557 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1558 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1559 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1560 				err = ptype->gso_send_check(skb);
1561 				segs = ERR_PTR(err);
1562 				if (err || skb_gso_ok(skb, features))
1563 					break;
1564 				__skb_push(skb, (skb->data -
1565 						 skb_network_header(skb)));
1566 			}
1567 			segs = ptype->gso_segment(skb, features);
1568 			break;
1569 		}
1570 	}
1571 	rcu_read_unlock();
1572 
1573 	__skb_push(skb, skb->data - skb_mac_header(skb));
1574 
1575 	return segs;
1576 }
1577 
1578 EXPORT_SYMBOL(skb_gso_segment);
1579 
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	/* Rate-limit: csum failures can come in bursts. */
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1592 
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/* Return 1 if @skb carries a highmem fragment that @dev cannot DMA
 * from (i.e. the device lacks NETIF_F_HIGHDMA); always 0 without
 * CONFIG_HIGHMEM. */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
			return 1;

#endif
	return 0;
}
1613 
/* Private skb control-block state for software GSO: the original
 * destructor is parked here by dev_gso_segment() while the segment
 * list hangs off skb->next. */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1619 
/* Destructor installed by dev_gso_segment(): free all unsent segments
 * chained on skb->next (assumed non-empty here), then run the original
 * destructor saved in the control block, if any. */
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}
1636 
1637 /**
1638  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1639  *	@skb: buffer to segment
1640  *
1641  *	This function segments the given skb and stores the list of segments
1642  *	in skb->next.
1643  */
1644 static int dev_gso_segment(struct sk_buff *skb)
1645 {
1646 	struct net_device *dev = skb->dev;
1647 	struct sk_buff *segs;
1648 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1649 					 NETIF_F_SG : 0);
1650 
1651 	segs = skb_gso_segment(skb, features);
1652 
1653 	/* Verifying header integrity only. */
1654 	if (!segs)
1655 		return 0;
1656 
1657 	if (IS_ERR(segs))
1658 		return PTR_ERR(segs);
1659 
1660 	skb->next = segs;
1661 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1662 	skb->destructor = dev_gso_skb_destructor;
1663 
1664 	return 0;
1665 }
1666 
/*
 * Hand @skb to the driver's ndo_start_xmit().  If the skb was
 * software-GSO segmented (list on skb->next), transmit the segments one
 * by one, re-chaining the remainder for the caller to retry on failure.
 * Returns the driver's transmit status.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc;

	if (likely(!skb->next)) {
		/* Feed taps (e.g. af_packet) before the real transmit. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		rc = ops->ndo_start_xmit(skb, dev);
		/*
		 * TODO: if skb_orphan() was called by
		 * dev->hard_start_xmit() (for example, the unmodified
		 * igb driver does that; bnx2 doesn't), then
		 * skb_tx_software_timestamp() will be unable to send
		 * back the time stamp.
		 *
		 * How can this be prevented? Always create another
		 * reference to the socket before calling
		 * dev->hard_start_xmit()? Prevent that skb_orphan()
		 * does anything in dev->hard_start_xmit() by clearing
		 * the skb destructor before the call and restoring it
		 * afterwards, then doing the skb_orphan() ourselves?
		 */
		return rc;
	}

gso:
	/* Transmit the segment list; on error, put the failed segment
	 * back at the head so the destructor can free the rest. */
	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments sent: restore the parked destructor and release
	 * the (now segment-less) original skb below. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1724 
/* Random seed for skb_tx_hash(); presumably initialized at boot — the
 * seeding site is not visible in this section. */
static u32 skb_tx_hashrnd;
1726 
1727 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1728 {
1729 	u32 hash;
1730 
1731 	if (skb_rx_queue_recorded(skb)) {
1732 		hash = skb_get_rx_queue(skb);
1733 	} else if (skb->sk && skb->sk->sk_hash) {
1734 		hash = skb->sk->sk_hash;
1735 	} else
1736 		hash = skb->protocol;
1737 
1738 	hash = jhash_1word(hash, skb_tx_hashrnd);
1739 
1740 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1741 }
1742 EXPORT_SYMBOL(skb_tx_hash);
1743 
1744 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1745 					struct sk_buff *skb)
1746 {
1747 	const struct net_device_ops *ops = dev->netdev_ops;
1748 	u16 queue_index = 0;
1749 
1750 	if (ops->ndo_select_queue)
1751 		queue_index = ops->ndo_select_queue(dev, skb);
1752 	else if (dev->real_num_tx_queues > 1)
1753 		queue_index = skb_tx_hash(dev, skb);
1754 
1755 	skb_set_queue_mapping(skb, queue_index);
1756 	return netdev_get_tx_queue(dev, queue_index);
1757 }
1758 
1759 /**
1760  *	dev_queue_xmit - transmit a buffer
1761  *	@skb: buffer to transmit
1762  *
1763  *	Queue a buffer for transmission to a network device. The caller must
1764  *	have set the device and priority and built the buffer before calling
1765  *	this function. The function can be called from an interrupt.
1766  *
1767  *	A negative errno code is returned on a failure. A success does not
1768  *	guarantee the frame will be transmitted as it may be dropped due
1769  *	to congestion or traffic shaping.
1770  *
1771  * -----------------------------------------------------------------------------------
1772  *      I notice this method can also return errors from the queue disciplines,
1773  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1774  *      be positive.
1775  *
1776  *      Regardless of the return value, the skb is consumed, so it is currently
1777  *      difficult to retry a send to this method.  (You can bump the ref count
1778  *      before sending to hold a reference for retry if you are careful.)
1779  *
1780  *      When calling this method, interrupts MUST be enabled.  This is because
1781  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1782  *          --BLG
1783  */
1784 int dev_queue_xmit(struct sk_buff *skb)
1785 {
1786 	struct net_device *dev = skb->dev;
1787 	struct netdev_queue *txq;
1788 	struct Qdisc *q;
1789 	int rc = -ENOMEM;
1790 
1791 	/* GSO will handle the following emulations directly. */
1792 	if (netif_needs_gso(dev, skb))
1793 		goto gso;
1794 
1795 	if (skb_shinfo(skb)->frag_list &&
1796 	    !(dev->features & NETIF_F_FRAGLIST) &&
1797 	    __skb_linearize(skb))
1798 		goto out_kfree_skb;
1799 
1800 	/* Fragmented skb is linearized if device does not support SG,
1801 	 * or if at least one of fragments is in highmem and device
1802 	 * does not support DMA from it.
1803 	 */
1804 	if (skb_shinfo(skb)->nr_frags &&
1805 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1806 	    __skb_linearize(skb))
1807 		goto out_kfree_skb;
1808 
1809 	/* If packet is not checksummed and device does not support
1810 	 * checksumming for this protocol, complete checksumming here.
1811 	 */
1812 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1813 		skb_set_transport_header(skb, skb->csum_start -
1814 					      skb_headroom(skb));
1815 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1816 			goto out_kfree_skb;
1817 	}
1818 
1819 gso:
1820 	/* Disable soft irqs for various locks below. Also
1821 	 * stops preemption for RCU.
1822 	 */
1823 	rcu_read_lock_bh();
1824 
1825 	txq = dev_pick_tx(dev, skb);
1826 	q = rcu_dereference(txq->qdisc);
1827 
1828 #ifdef CONFIG_NET_CLS_ACT
1829 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1830 #endif
1831 	if (q->enqueue) {
1832 		spinlock_t *root_lock = qdisc_lock(q);
1833 
1834 		spin_lock(root_lock);
1835 
1836 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1837 			kfree_skb(skb);
1838 			rc = NET_XMIT_DROP;
1839 		} else {
1840 			rc = qdisc_enqueue_root(skb, q);
1841 			qdisc_run(q);
1842 		}
1843 		spin_unlock(root_lock);
1844 
1845 		goto out;
1846 	}
1847 
1848 	/* The device has no queue. Common case for software devices:
1849 	   loopback, all the sorts of tunnels...
1850 
1851 	   Really, it is unlikely that netif_tx_lock protection is necessary
1852 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1853 	   counters.)
1854 	   However, it is possible, that they rely on protection
1855 	   made by us here.
1856 
1857 	   Check this and shot the lock. It is not prone from deadlocks.
1858 	   Either shot noqueue qdisc, it is even simpler 8)
1859 	 */
1860 	if (dev->flags & IFF_UP) {
1861 		int cpu = smp_processor_id(); /* ok because BHs are off */
1862 
1863 		if (txq->xmit_lock_owner != cpu) {
1864 
1865 			HARD_TX_LOCK(dev, txq, cpu);
1866 
1867 			if (!netif_tx_queue_stopped(txq)) {
1868 				rc = 0;
1869 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1870 					HARD_TX_UNLOCK(dev, txq);
1871 					goto out;
1872 				}
1873 			}
1874 			HARD_TX_UNLOCK(dev, txq);
1875 			if (net_ratelimit())
1876 				printk(KERN_CRIT "Virtual device %s asks to "
1877 				       "queue packet!\n", dev->name);
1878 		} else {
1879 			/* Recursion is detected! It is possible,
1880 			 * unfortunately */
1881 			if (net_ratelimit())
1882 				printk(KERN_CRIT "Dead loop on virtual device "
1883 				       "%s, fix it urgently!\n", dev->name);
1884 		}
1885 	}
1886 
1887 	rc = -ENETDOWN;
1888 	rcu_read_unlock_bh();
1889 
1890 out_kfree_skb:
1891 	kfree_skb(skb);
1892 	return rc;
1893 out:
1894 	rcu_read_unlock_bh();
1895 	return rc;
1896 }
1897 
1898 
1899 /*=======================================================================
1900 			Receiver routines
1901   =======================================================================*/
1902 
int netdev_max_backlog __read_mostly = 1000;	/* max per-cpu input_pkt_queue length, see netif_rx() */
int netdev_budget __read_mostly = 300;	/* tunable; consumer not visible in this section */
int weight_p __read_mostly = 64;            /* old backlog weight */

DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1908 
1909 
1910 /**
1911  *	netif_rx	-	post buffer to the network code
1912  *	@skb: buffer to post
1913  *
1914  *	This function receives a packet from a device driver and queues it for
1915  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1916  *	may be dropped during processing for congestion control or by the
1917  *	protocol layers.
1918  *
1919  *	return values:
1920  *	NET_RX_SUCCESS	(no congestion)
1921  *	NET_RX_DROP     (packet was dropped)
1922  *
1923  */
1924 
1925 int netif_rx(struct sk_buff *skb)
1926 {
1927 	struct softnet_data *queue;
1928 	unsigned long flags;
1929 
1930 	/* if netpoll wants it, pretend we never saw it */
1931 	if (netpoll_rx(skb))
1932 		return NET_RX_DROP;
1933 
1934 	if (!skb->tstamp.tv64)
1935 		net_timestamp(skb);
1936 
1937 	/*
1938 	 * The code is rearranged so that the path is the most
1939 	 * short when CPU is congested, but is still operating.
1940 	 */
1941 	local_irq_save(flags);
1942 	queue = &__get_cpu_var(softnet_data);
1943 
1944 	__get_cpu_var(netdev_rx_stat).total++;
1945 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1946 		if (queue->input_pkt_queue.qlen) {
1947 enqueue:
1948 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1949 			local_irq_restore(flags);
1950 			return NET_RX_SUCCESS;
1951 		}
1952 
1953 		napi_schedule(&queue->backlog);
1954 		goto enqueue;
1955 	}
1956 
1957 	__get_cpu_var(netdev_rx_stat).dropped++;
1958 	local_irq_restore(flags);
1959 
1960 	kfree_skb(skb);
1961 	return NET_RX_DROP;
1962 }
1963 
/* Like netif_rx() but callable from process context: any softirq the
 * enqueue raised is run here, since no interrupt exit will do it. */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	preempt_disable();
	err = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}

EXPORT_SYMBOL(netif_rx_ni);
1978 
/*
 * NET_TX_SOFTIRQ handler: frees skbs queued by dev_kfree_skb_irq() and
 * runs the qdiscs scheduled via __netif_schedule().
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, free at leisure. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Lock busy: reschedule unless the qdisc
				 * is being torn down, in which case only
				 * clear the SCHED latch. */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
2034 
/* Deliver @skb to one packet_type handler; take an extra reference
 * since ptype->func() consumes one. */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
2042 
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;

/* Divert @skb to the bridge when its device is enslaved to one.
 * Any pending packet_type delivery is flushed first because the
 * bridge hook may consume the skb. */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK ||
	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2076 
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Hook filled in by the macvlan module when loaded. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/* Divert @skb to macvlan processing when its device hosts macvlan
 * ports; pending packet_type delivery is flushed first since the hook
 * may consume the skb. */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (skb->dev->macvlan_port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}
	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2098 
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 */

/* Run @skb through its device's ingress qdisc and return the traffic
 * control verdict (TC_ACT_*).  A redirect-loop counter kept in
 * skb->tc_verd guards against packets bouncing forever. */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	u32 ttl = G_TC_RTTL(skb->tc_verd);
	struct netdev_queue *rxq;
	int result = TC_ACT_OK;
	struct Qdisc *q;

	if (MAX_RED_LOOP < ttl++) {
		printk(KERN_WARNING
		       "Redir loop detected Dropping packet (%d->%d)\n",
		       skb->iif, dev->ifindex);
		return TC_ACT_SHOT;
	}

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	rxq = &dev->rx_queue;

	q = rxq->qdisc;
	if (q != &noop_qdisc) {
		spin_lock(qdisc_lock(q));
		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
			result = qdisc_enqueue_root(skb, q);
		spin_unlock(qdisc_lock(q));
	}

	return result;
}
2138 
2139 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2140 					 struct packet_type **pt_prev,
2141 					 int *ret, struct net_device *orig_dev)
2142 {
2143 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2144 		goto out;
2145 
2146 	if (*pt_prev) {
2147 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2148 		*pt_prev = NULL;
2149 	} else {
2150 		/* Huh? Why does turning on AF_PACKET affect this? */
2151 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2152 	}
2153 
2154 	switch (ing_filter(skb)) {
2155 	case TC_ACT_SHOT:
2156 	case TC_ACT_STOLEN:
2157 		kfree_skb(skb);
2158 		return NULL;
2159 	}
2160 
2161 out:
2162 	skb->tc_verd = 0;
2163 	return skb;
2164 }
2165 #endif
2166 
2167 /*
2168  * 	netif_nit_deliver - deliver received packets to network taps
2169  * 	@skb: buffer
2170  *
2171  * 	This function is used to deliver incoming packets to network
2172  * 	taps. It should be used when the normal netif_receive_skb path
2173  * 	is bypassed, for example because of VLAN acceleration.
2174  */
2175 void netif_nit_deliver(struct sk_buff *skb)
2176 {
2177 	struct packet_type *ptype;
2178 
2179 	if (list_empty(&ptype_all))
2180 		return;
2181 
2182 	skb_reset_network_header(skb);
2183 	skb_reset_transport_header(skb);
2184 	skb->mac_len = skb->network_header - skb->mac_header;
2185 
2186 	rcu_read_lock();
2187 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2188 		if (!ptype->dev || ptype->dev == skb->dev)
2189 			deliver_skb(skb, ptype, skb->dev);
2190 	}
2191 	rcu_read_unlock();
2192 }
2193 
2194 /**
2195  *	netif_receive_skb - process receive buffer from network
2196  *	@skb: buffer to process
2197  *
2198  *	netif_receive_skb() is the main receive data processing function.
2199  *	It always succeeds. The buffer may be dropped during processing
2200  *	for congestion control or by the protocol layers.
2201  *
2202  *	This function may only be called from softirq context and interrupts
2203  *	should be enabled.
2204  *
2205  *	Return values (usually ignored):
2206  *	NET_RX_SUCCESS: no congestion
2207  *	NET_RX_DROP: packet was dropped
2208  */
2209 int netif_receive_skb(struct sk_buff *skb)
2210 {
2211 	struct packet_type *ptype, *pt_prev;
2212 	struct net_device *orig_dev;
2213 	struct net_device *null_or_orig;
2214 	int ret = NET_RX_DROP;
2215 	__be16 type;
2216 
2217 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2218 		return NET_RX_SUCCESS;
2219 
2220 	/* if we've gotten here through NAPI, check netpoll */
2221 	if (netpoll_receive_skb(skb))
2222 		return NET_RX_DROP;
2223 
2224 	if (!skb->tstamp.tv64)
2225 		net_timestamp(skb);
2226 
2227 	if (!skb->iif)
2228 		skb->iif = skb->dev->ifindex;
2229 
2230 	null_or_orig = NULL;
2231 	orig_dev = skb->dev;
2232 	if (orig_dev->master) {
2233 		if (skb_bond_should_drop(skb))
2234 			null_or_orig = orig_dev; /* deliver only exact match */
2235 		else
2236 			skb->dev = orig_dev->master;
2237 	}
2238 
2239 	__get_cpu_var(netdev_rx_stat).total++;
2240 
2241 	skb_reset_network_header(skb);
2242 	skb_reset_transport_header(skb);
2243 	skb->mac_len = skb->network_header - skb->mac_header;
2244 
2245 	pt_prev = NULL;
2246 
2247 	rcu_read_lock();
2248 
2249 #ifdef CONFIG_NET_CLS_ACT
2250 	if (skb->tc_verd & TC_NCLS) {
2251 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2252 		goto ncls;
2253 	}
2254 #endif
2255 
2256 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2257 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2258 		    ptype->dev == orig_dev) {
2259 			if (pt_prev)
2260 				ret = deliver_skb(skb, pt_prev, orig_dev);
2261 			pt_prev = ptype;
2262 		}
2263 	}
2264 
2265 #ifdef CONFIG_NET_CLS_ACT
2266 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2267 	if (!skb)
2268 		goto out;
2269 ncls:
2270 #endif
2271 
2272 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2273 	if (!skb)
2274 		goto out;
2275 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2276 	if (!skb)
2277 		goto out;
2278 
2279 	skb_orphan(skb);
2280 
2281 	type = skb->protocol;
2282 	list_for_each_entry_rcu(ptype,
2283 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2284 		if (ptype->type == type &&
2285 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2286 		     ptype->dev == orig_dev)) {
2287 			if (pt_prev)
2288 				ret = deliver_skb(skb, pt_prev, orig_dev);
2289 			pt_prev = ptype;
2290 		}
2291 	}
2292 
2293 	if (pt_prev) {
2294 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2295 	} else {
2296 		kfree_skb(skb);
2297 		/* Jamal, now you will not able to escape explaining
2298 		 * me how you were going to use this. :-)
2299 		 */
2300 		ret = NET_RX_DROP;
2301 	}
2302 
2303 out:
2304 	rcu_read_unlock();
2305 	return ret;
2306 }
2307 
2308 /* Network device is going away, flush any packets still pending  */
2309 static void flush_backlog(void *arg)
2310 {
2311 	struct net_device *dev = arg;
2312 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2313 	struct sk_buff *skb, *tmp;
2314 
2315 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2316 		if (skb->dev == dev) {
2317 			__skb_unlink(skb, &queue->input_pkt_queue);
2318 			kfree_skb(skb);
2319 		}
2320 }
2321 
/*
 * Finish off a GRO-held skb and push it up the stack: give the
 * protocol's gro_complete() callback a chance to fix up headers of a
 * merged super-packet, then hand it to netif_receive_skb().
 */
static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int err = -ENOENT;

	/* Nothing was merged into this skb; no fixup needed. */
	if (NAPI_GRO_CB(skb)->count == 1)
		goto out;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		/* Only wildcard-device handlers participate in GRO. */
		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
			continue;

		err = ptype->gro_complete(skb);
		break;
	}
	rcu_read_unlock();

	if (err) {
		/* err != 0 with a matching handler found is unexpected;
		 * the WARN fires only when gro_complete() itself failed
		 * (i.e. we did break out of the loop). */
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	skb_shinfo(skb)->gso_size = 0;
	return netif_receive_skb(skb);
}
2352 
2353 void napi_gro_flush(struct napi_struct *napi)
2354 {
2355 	struct sk_buff *skb, *next;
2356 
2357 	for (skb = napi->gro_list; skb; skb = next) {
2358 		next = skb->next;
2359 		skb->next = NULL;
2360 		napi_gro_complete(skb);
2361 	}
2362 
2363 	napi->gro_count = 0;
2364 	napi->gro_list = NULL;
2365 }
2366 EXPORT_SYMBOL(napi_gro_flush);
2367 
/*
 * Return a pointer to @hlen bytes of packet data starting at the
 * current GRO offset, without necessarily pulling them into the linear
 * area.  Returns NULL if the data cannot be made available.
 */
void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
{
	unsigned int offset = skb_gro_offset(skb);

	hlen += offset;
	/* Fast path: the requested bytes are already linear. */
	if (hlen <= skb_headlen(skb))
		return skb->data + offset;

	/* Otherwise the bytes must come from the first page fragment.
	 * If there is no fragment, the fragment is too small, or it
	 * lives in highmem (not directly addressable), fall back to
	 * pulling the data into the linear area. */
	if (unlikely(!skb_shinfo(skb)->nr_frags ||
		     skb_shinfo(skb)->frags[0].size <=
		     hlen - skb_headlen(skb) ||
		     PageHighMem(skb_shinfo(skb)->frags[0].page)))
		return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;

	/* Address the header directly inside the first fragment. */
	return page_address(skb_shinfo(skb)->frags[0].page) +
	       skb_shinfo(skb)->frags[0].page_offset +
	       offset - skb_headlen(skb);
}
EXPORT_SYMBOL(skb_gro_header);
2387 
/*
 * Core GRO entry point: try to merge @skb into a flow already held on
 * @napi's GRO list.  Returns one of GRO_MERGED, GRO_MERGED_FREE,
 * GRO_HELD, GRO_NORMAL or GRO_DROP describing what happened to the skb.
 */
int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int same_flow;
	int mac_len;
	int ret;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	/* Already-segmented or frag_list skbs cannot be merged. */
	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
		goto normal;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		/* Only wildcard-device handlers participate in GRO. */
		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		/* gro_receive() returns a non-NULL list position when a
		 * held flow must be flushed out of the list. */
		pp = ptype->gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* No GRO-capable handler for this protocol. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* Flush the flow the callback asked us to complete. */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	/* skb was merged into an existing held flow; we are done. */
	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* Hold this skb as the head of a new flow. */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* Make sure the parsed headers are linear; on failure unlink
	 * the skb (if held) and report a drop. */
	if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
		if (napi->gro_list == skb)
			napi->gro_list = skb->next;
		ret = GRO_DROP;
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
2464 
2465 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2466 {
2467 	struct sk_buff *p;
2468 
2469 	if (netpoll_rx_on(skb))
2470 		return GRO_NORMAL;
2471 
2472 	for (p = napi->gro_list; p; p = p->next) {
2473 		NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
2474 			skb_mac_header(p), skb_gro_mac_header(skb));
2475 		NAPI_GRO_CB(p)->flush = 0;
2476 	}
2477 
2478 	return dev_gro_receive(napi, skb);
2479 }
2480 
2481 int napi_skb_finish(int ret, struct sk_buff *skb)
2482 {
2483 	int err = NET_RX_SUCCESS;
2484 
2485 	switch (ret) {
2486 	case GRO_NORMAL:
2487 		return netif_receive_skb(skb);
2488 
2489 	case GRO_DROP:
2490 		err = NET_RX_DROP;
2491 		/* fall through */
2492 
2493 	case GRO_MERGED_FREE:
2494 		kfree_skb(skb);
2495 		break;
2496 	}
2497 
2498 	return err;
2499 }
2500 EXPORT_SYMBOL(napi_skb_finish);
2501 
/*
 * Driver-facing GRO receive entry point: reset the GRO parsing offset
 * and run the skb through the GRO engine.
 */
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	int verdict;

	skb_gro_reset_offset(skb);
	verdict = __napi_gro_receive(napi, skb);

	return napi_skb_finish(verdict, skb);
}
EXPORT_SYMBOL(napi_gro_receive);
2509 
/*
 * Recycle @skb as @napi's spare buffer for the next napi_fraginfo_skb()
 * call: strip any linear data and restore the original NET_IP_ALIGN
 * headroom so it looks freshly allocated.
 */
void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	__skb_pull(skb, skb_headlen(skb));
	/* headroom may have grown past NET_IP_ALIGN; shrink it back. */
	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));

	napi->skb = skb;
}
EXPORT_SYMBOL(napi_reuse_skb);
2518 
/*
 * Build an skb around the page fragments described by @info, reusing
 * @napi's cached spare skb when available.  Used by drivers that
 * receive into pages rather than linear buffers.  Returns NULL on
 * allocation failure or when the Ethernet header cannot be accessed.
 */
struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
				  struct napi_gro_fraginfo *info)
{
	struct net_device *dev = napi->dev;
	struct sk_buff *skb = napi->skb;
	struct ethhdr *eth;
	skb_frag_t *frag;
	int i;

	napi->skb = NULL;

	if (!skb) {
		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
		if (!skb)
			goto out;

		skb_reserve(skb, NET_IP_ALIGN);
	}

	/* Attach the caller's page fragments after any the (reused)
	 * skb already carries. */
	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
	frag = &info->frags[info->nr_frags - 1];

	for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
		skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
				   frag->size);
		frag++;
	}
	skb_shinfo(skb)->nr_frags = info->nr_frags;

	skb->data_len = info->len;
	skb->len += info->len;
	skb->truesize += info->len;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	/* Peek at the Ethernet header (may live in the first page
	 * fragment); recycle the skb if it is unreachable. */
	eth = skb_gro_header(skb, sizeof(*eth));
	if (!eth) {
		napi_reuse_skb(napi, skb);
		skb = NULL;
		goto out;
	}

	skb_gro_pull(skb, sizeof(*eth));

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.  We'll fix it up properly at the end.
	 */
	skb->protocol = eth->h_proto;

	skb->ip_summed = info->ip_summed;
	skb->csum = info->csum;

out:
	return skb;
}
EXPORT_SYMBOL(napi_fraginfo_skb);
2577 
/*
 * Finish GRO processing of a fraginfo-built skb according to the
 * dev_gro_receive() verdict @ret.  Unlike napi_skb_finish(), dropped
 * or merged skbs are recycled via napi_reuse_skb() rather than freed.
 */
int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
{
	int err = NET_RX_SUCCESS;

	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		skb->protocol = eth_type_trans(skb, napi->dev);

		if (ret == GRO_NORMAL)
			return netif_receive_skb(skb);

		/* Held skb: undo the header pull done by eth_type_trans
		 * so the GRO offset still points past the Ethernet
		 * header (negative pull rewinds skb->data). */
		skb_gro_pull(skb, -ETH_HLEN);
		break;

	case GRO_DROP:
		err = NET_RX_DROP;
		/* fall through */

	case GRO_MERGED_FREE:
		napi_reuse_skb(napi, skb);
		break;
	}

	return err;
}
EXPORT_SYMBOL(napi_frags_finish);
2605 
2606 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2607 {
2608 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2609 
2610 	if (!skb)
2611 		return NET_RX_DROP;
2612 
2613 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2614 }
2615 EXPORT_SYMBOL(napi_gro_frags);
2616 
/*
 * NAPI poll handler for the legacy (non-NAPI driver) per-CPU backlog
 * queue: drain up to @quota packets, but never run past the jiffy we
 * started in, so other softirq work is not starved.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;

	napi->weight = weight_p;
	do {
		struct sk_buff *skb;

		/* The queue is filled from hard-irq context (netif_rx),
		 * so dequeueing must be irq-safe. */
		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb) {
			/* Queue empty: complete NAPI while irqs are
			 * still off so a racing netif_rx reschedules. */
			__napi_complete(napi);
			local_irq_enable();
			break;
		}
		local_irq_enable();

		netif_receive_skb(skb);
	} while (++work < quota && jiffies == start_time);

	return work;
}
2641 
2642 /**
2643  * __napi_schedule - schedule for receive
2644  * @n: entry to schedule
2645  *
2646  * The entry's receive function will be scheduled to run
2647  */
2648 void __napi_schedule(struct napi_struct *n)
2649 {
2650 	unsigned long flags;
2651 
2652 	local_irq_save(flags);
2653 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2654 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2655 	local_irq_restore(flags);
2656 }
2657 EXPORT_SYMBOL(__napi_schedule);
2658 
2659 void __napi_complete(struct napi_struct *n)
2660 {
2661 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2662 	BUG_ON(n->gro_list);
2663 
2664 	list_del(&n->poll_list);
2665 	smp_mb__before_clear_bit();
2666 	clear_bit(NAPI_STATE_SCHED, &n->state);
2667 }
2668 EXPORT_SYMBOL(__napi_complete);
2669 
2670 void napi_complete(struct napi_struct *n)
2671 {
2672 	unsigned long flags;
2673 
2674 	/*
2675 	 * don't let napi dequeue from the cpu poll list
2676 	 * just in case its running on a different cpu
2677 	 */
2678 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2679 		return;
2680 
2681 	napi_gro_flush(n);
2682 	local_irq_save(flags);
2683 	__napi_complete(n);
2684 	local_irq_restore(flags);
2685 }
2686 EXPORT_SYMBOL(napi_complete);
2687 
/*
 * Initialize a NAPI context and attach it to @dev.  The context starts
 * in the SCHED state, i.e. disabled until the driver enables it
 * (conventionally via napi_enable()).
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	napi->poll_owner = -1;
#endif
	/* Owned (not pollable) until the driver releases it. */
	set_bit(NAPI_STATE_SCHED, &napi->state);
}
EXPORT_SYMBOL(netif_napi_add);
2706 
2707 void netif_napi_del(struct napi_struct *napi)
2708 {
2709 	struct sk_buff *skb, *next;
2710 
2711 	list_del_init(&napi->dev_list);
2712 	kfree_skb(napi->skb);
2713 
2714 	for (skb = napi->gro_list; skb; skb = next) {
2715 		next = skb->next;
2716 		skb->next = NULL;
2717 		kfree_skb(skb);
2718 	}
2719 
2720 	napi->gro_list = NULL;
2721 	napi->gro_count = 0;
2722 }
2723 EXPORT_SYMBOL(netif_napi_del);
2724 
2725 
/*
 * NET_RX_SOFTIRQ handler: round-robin over this CPU's scheduled NAPI
 * contexts, calling each ->poll() with its weight, until either the
 * packet budget or the time budget is spent.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_entry(list->next, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state))
			work = n->poll(n, weight);

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n)))
				__napi_complete(n);
			else
				list_move_tail(&n->poll_list, list);
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of budget: account the squeeze and re-raise ourselves so
	 * the remaining work runs in a later softirq pass. */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
2807 
/* Per-address-family SIOCGIFCONF handlers; a slot stays NULL until a
 * protocol registers one via register_gifconf(). */
static gifconf_func_t * gifconf_list [NPROTO];

/**
 *	register_gifconf	-	register a SIOCGIF handler
 *	@family: Address family
 *	@gifconf: Function handler
 *
 *	Register protocol dependent address dumping routines. The handler
 *	that is passed must not be freed or reused until it has been replaced
 *	by another handler.
 */
int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
{
	if (family >= NPROTO)
		return -EINVAL;
	gifconf_list[family] = gifconf;
	return 0;
}
2826 
2827 
2828 /*
2829  *	Map an interface index to its name (SIOCGIFNAME)
2830  */
2831 
2832 /*
2833  *	We need this ioctl for efficient implementation of the
2834  *	if_indextoname() function required by the IPv6 API.  Without
2835  *	it, we would have to search all the interfaces to find a
2836  *	match.  --pb
2837  */
2838 
2839 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2840 {
2841 	struct net_device *dev;
2842 	struct ifreq ifr;
2843 
2844 	/*
2845 	 *	Fetch the caller's info block.
2846 	 */
2847 
2848 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2849 		return -EFAULT;
2850 
2851 	read_lock(&dev_base_lock);
2852 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2853 	if (!dev) {
2854 		read_unlock(&dev_base_lock);
2855 		return -ENODEV;
2856 	}
2857 
2858 	strcpy(ifr.ifr_name, dev->name);
2859 	read_unlock(&dev_base_lock);
2860 
2861 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2862 		return -EFAULT;
2863 	return 0;
2864 }
2865 
2866 /*
2867  *	Perform a SIOCGIFCONF call. This structure will change
2868  *	size eventually, and there is nothing I can do about it.
2869  *	Thus we will need a 'compatibility mode'.
2870  */
2871 
2872 static int dev_ifconf(struct net *net, char __user *arg)
2873 {
2874 	struct ifconf ifc;
2875 	struct net_device *dev;
2876 	char __user *pos;
2877 	int len;
2878 	int total;
2879 	int i;
2880 
2881 	/*
2882 	 *	Fetch the caller's info block.
2883 	 */
2884 
2885 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2886 		return -EFAULT;
2887 
2888 	pos = ifc.ifc_buf;
2889 	len = ifc.ifc_len;
2890 
2891 	/*
2892 	 *	Loop over the interfaces, and write an info block for each.
2893 	 */
2894 
2895 	total = 0;
2896 	for_each_netdev(net, dev) {
2897 		for (i = 0; i < NPROTO; i++) {
2898 			if (gifconf_list[i]) {
2899 				int done;
2900 				if (!pos)
2901 					done = gifconf_list[i](dev, NULL, 0);
2902 				else
2903 					done = gifconf_list[i](dev, pos + total,
2904 							       len - total);
2905 				if (done < 0)
2906 					return -EFAULT;
2907 				total += done;
2908 			}
2909 		}
2910 	}
2911 
2912 	/*
2913 	 *	All done.  Write the updated control block back to the caller.
2914 	 */
2915 	ifc.ifc_len = total;
2916 
2917 	/*
2918 	 * 	Both BSD and Solaris return 0 here, so we do too.
2919 	 */
2920 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2921 }
2922 
2923 #ifdef CONFIG_PROC_FS
2924 /*
2925  *	This is invoked by the /proc filesystem handler to display a device
2926  *	in detail.
2927  */
2928 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2929 	__acquires(dev_base_lock)
2930 {
2931 	struct net *net = seq_file_net(seq);
2932 	loff_t off;
2933 	struct net_device *dev;
2934 
2935 	read_lock(&dev_base_lock);
2936 	if (!*pos)
2937 		return SEQ_START_TOKEN;
2938 
2939 	off = 1;
2940 	for_each_netdev(net, dev)
2941 		if (off++ == *pos)
2942 			return dev;
2943 
2944 	return NULL;
2945 }
2946 
2947 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2948 {
2949 	struct net *net = seq_file_net(seq);
2950 	++*pos;
2951 	return v == SEQ_START_TOKEN ?
2952 		first_net_device(net) : next_net_device((struct net_device *)v);
2953 }
2954 
/* seq_file ->stop for /proc/net/dev: drop the lock ->start took. */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(dev_base_lock)
{
	read_unlock(&dev_base_lock);
}

/*
 * Emit one /proc/net/dev line for @dev.  Several hardware counters are
 * folded into the classic columns (e.g. missed goes into drop, the
 * CRC/frame/length/over errors into frame).
 */
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
		   dev->name, stats->rx_bytes, stats->rx_packets,
		   stats->rx_errors,
		   stats->rx_dropped + stats->rx_missed_errors,
		   stats->rx_fifo_errors,
		   stats->rx_length_errors + stats->rx_over_errors +
		    stats->rx_crc_errors + stats->rx_frame_errors,
		   stats->rx_compressed, stats->multicast,
		   stats->tx_bytes, stats->tx_packets,
		   stats->tx_errors, stats->tx_dropped,
		   stats->tx_fifo_errors, stats->collisions,
		   stats->tx_carrier_errors +
		    stats->tx_aborted_errors +
		    stats->tx_window_errors +
		    stats->tx_heartbeat_errors,
		   stats->tx_compressed);
}
2983 
2984 /*
2985  *	Called from the PROCfs module. This now uses the new arbitrary sized
2986  *	/proc/net interface to create /proc/net/dev
2987  */
2988 static int dev_seq_show(struct seq_file *seq, void *v)
2989 {
2990 	if (v == SEQ_START_TOKEN)
2991 		seq_puts(seq, "Inter-|   Receive                            "
2992 			      "                    |  Transmit\n"
2993 			      " face |bytes    packets errs drop fifo frame "
2994 			      "compressed multicast|bytes    packets errs "
2995 			      "drop fifo colls carrier compressed\n");
2996 	else
2997 		dev_seq_printf_stats(seq, v);
2998 	return 0;
2999 }
3000 
3001 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3002 {
3003 	struct netif_rx_stats *rc = NULL;
3004 
3005 	while (*pos < nr_cpu_ids)
3006 		if (cpu_online(*pos)) {
3007 			rc = &per_cpu(netdev_rx_stat, *pos);
3008 			break;
3009 		} else
3010 			++*pos;
3011 	return rc;
3012 }
3013 
/* seq_file ->start for /proc/net/softnet_stat: first online CPU. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}

/* seq_file ->next: the following online CPU. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}

/* seq_file ->stop: nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}

/*
 * One line of per-CPU RX statistics.  The literal-zero columns are
 * retained for format compatibility with older kernels.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct netif_rx_stats *s = v;

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   s->total, s->dropped, s->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
		   s->cpu_collision );
	return 0;
}

/* Iterator over net devices for /proc/net/dev. */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};

static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};

/* Iterator over online CPUs for /proc/net/softnet_stat. */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};

static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}

static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
3080 
3081 static void *ptype_get_idx(loff_t pos)
3082 {
3083 	struct packet_type *pt = NULL;
3084 	loff_t i = 0;
3085 	int t;
3086 
3087 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3088 		if (i == pos)
3089 			return pt;
3090 		++i;
3091 	}
3092 
3093 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3094 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3095 			if (i == pos)
3096 				return pt;
3097 			++i;
3098 		}
3099 	}
3100 	return NULL;
3101 }
3102 
/* seq_file ->start for /proc/net/ptype; RCU protects the lists and is
 * held until ->stop. */
static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

/*
 * seq_file ->next: step to the following packet_type without
 * recounting from the top.  ETH_P_ALL entries live on ptype_all; when
 * that list is exhausted the walk continues through the ptype_base
 * hash buckets in order, skipping empty ones.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	if (pt->type == htons(ETH_P_ALL)) {
		if (nxt != &ptype_all)
			goto found;
		/* ptype_all finished: fall through to bucket 0. */
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	/* Skip over empty buckets until a non-empty one or the end. */
	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}

static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

/*
 * Emit one /proc/net/ptype line: type (or "ALL"), bound device (if
 * any), and handler function.  Entries bound to a device in another
 * network namespace are suppressed.
 */
static int ptype_seq_show(struct seq_file *seq, void *v)
{
	struct packet_type *pt = v;

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Type Device      Function\n");
	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
		if (pt->type == htons(ETH_P_ALL))
			seq_puts(seq, "ALL ");
		else
			seq_printf(seq, "%04x", ntohs(pt->type));

		seq_printf(seq, " %-8s %pF\n",
			   pt->dev ? pt->dev->name : "", pt->func);
	}

	return 0;
}

static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};

static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}

static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3184 
3185 
3186 static int __net_init dev_proc_net_init(struct net *net)
3187 {
3188 	int rc = -ENOMEM;
3189 
3190 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3191 		goto out;
3192 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3193 		goto out_dev;
3194 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3195 		goto out_softnet;
3196 
3197 	if (wext_proc_init(net))
3198 		goto out_ptype;
3199 	rc = 0;
3200 out:
3201 	return rc;
3202 out_ptype:
3203 	proc_net_remove(net, "ptype");
3204 out_softnet:
3205 	proc_net_remove(net, "softnet_stat");
3206 out_dev:
3207 	proc_net_remove(net, "dev");
3208 	goto out;
3209 }
3210 
/* Per-namespace /proc teardown: mirror of dev_proc_net_init(). */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
3219 
/* Hook the /proc setup/teardown into network-namespace lifetime. */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};

static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
#else
#define dev_proc_init() 0
#endif	/* CONFIG_PROC_FS */
3232 

/**
 *	netdev_set_master	-	set up master/slave pair
 *	@slave: slave device
 *	@master: new master device
 *
 *	Changes the master device of the slave. Pass %NULL to break the
 *	bonding. The caller must hold the RTNL semaphore. On a failure
 *	a negative errno code is returned. On success the reference counts
 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
 *	function returns zero.
 */
int netdev_set_master(struct net_device *slave, struct net_device *master)
{
	struct net_device *old = slave->master;

	ASSERT_RTNL();

	if (master) {
		/* Refuse to silently re-enslave; caller must detach first. */
		if (old)
			return -EBUSY;
		dev_hold(master);
	}

	slave->master = master;

	/* Wait for in-flight readers of the old master pointer before
	 * dropping our reference to it. */
	synchronize_net();

	if (old)
		dev_put(old);

	if (master)
		slave->flags |= IFF_SLAVE;
	else
		slave->flags &= ~IFF_SLAVE;

	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
	return 0;
}
3272 
3273 static void dev_change_rx_flags(struct net_device *dev, int flags)
3274 {
3275 	const struct net_device_ops *ops = dev->netdev_ops;
3276 
3277 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3278 		ops->ndo_change_rx_flags(dev, flags);
3279 }
3280 
/* Adjust the device's promiscuity reference count by @inc and keep
 * IFF_PROMISC in sync with it.  Caller must hold RTNL.  Returns 0 or
 * -EOVERFLOW if the counter would wrap.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned short old_flags = dev->flags;
	uid_t uid;
	gid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	/* A count of 0 means either the last user dropped promiscuity
	 * (inc < 0) or the counter wrapped around (inc > 0).
	 */
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			printk(KERN_WARNING "%s: promiscuity touches roof, "
				"set promiscuity failed, promiscuity feature "
				"of device might be broken.\n", dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		printk(KERN_INFO "device %s %s promiscuous mode\n",
		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
							       "left");
		/* Promiscuous-mode transitions are security relevant;
		 * record them in the audit log when auditing is enabled.
		 */
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				audit_get_loginuid(current),
				uid, gid,
				audit_get_sessionid(current));
		}

		/* Let the driver react to the IFF_PROMISC change. */
		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	return 0;
}
3326 
3327 /**
3328  *	dev_set_promiscuity	- update promiscuity count on a device
3329  *	@dev: device
3330  *	@inc: modifier
3331  *
3332  *	Add or remove promiscuity from a device. While the count in the device
3333  *	remains above zero the interface remains promiscuous. Once it hits zero
3334  *	the device reverts back to normal filtering operation. A negative inc
3335  *	value is used to drop promiscuity on the device.
3336  *	Return 0 if successful or a negative errno code on error.
3337  */
3338 int dev_set_promiscuity(struct net_device *dev, int inc)
3339 {
3340 	unsigned short old_flags = dev->flags;
3341 	int err;
3342 
3343 	err = __dev_set_promiscuity(dev, inc);
3344 	if (err < 0)
3345 		return err;
3346 	if (dev->flags != old_flags)
3347 		dev_set_rx_mode(dev);
3348 	return err;
3349 }
3350 
3351 /**
3352  *	dev_set_allmulti	- update allmulti count on a device
3353  *	@dev: device
3354  *	@inc: modifier
3355  *
3356  *	Add or remove reception of all multicast frames to a device. While the
3357  *	count in the device remains above zero the interface remains listening
3358  *	to all interfaces. Once it hits zero the device reverts back to normal
3359  *	filtering operation. A negative @inc value is used to drop the counter
3360  *	when releasing a resource needing all multicasts.
3361  *	Return 0 if successful or a negative errno code on error.
3362  */
3363 
int dev_set_allmulti(struct net_device *dev, int inc)
{
	unsigned short old_flags = dev->flags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	/* A count of 0 means either the last user went away (inc < 0) or
	 * the counter wrapped around (inc > 0).
	 */
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			printk(KERN_WARNING "%s: allmulti touches roof, "
				"set allmulti failed, allmulti feature of "
				"device might be broken.\n", dev->name);
			return -EOVERFLOW;
		}
	}
	/* Unlike the promisc path, notify the driver and reload the rx
	 * filter in one place when IFF_ALLMULTI flipped.
	 */
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
	}
	return 0;
}
3393 
3394 /*
3395  *	Upload unicast and multicast address lists to device and
3396  *	configure RX filtering. When the device doesn't support unicast
3397  *	filtering it is put in promiscuous mode while unicast addresses
3398  *	are present.
3399  */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags&IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (ops->ndo_set_rx_mode)
		/* Driver handles both unicast and multicast filtering. */
		ops->ndo_set_rx_mode(dev);
	else {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		/* uc_promisc remembers whether we already hold a promisc
		 * reference on behalf of the secondary unicast list, so the
		 * count is bumped/dropped exactly once per transition.
		 */
		if (dev->uc_count > 0 && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1);
			dev->uc_promisc = 1;
		} else if (dev->uc_count == 0 && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1);
			dev->uc_promisc = 0;
		}

		if (ops->ndo_set_multicast_list)
			ops->ndo_set_multicast_list(dev);
	}
}
3429 
/* Locked wrapper around __dev_set_rx_mode(): takes the device address
 * list lock with BHs disabled for the duration of the update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3436 
3437 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3438 		      void *addr, int alen, int glbl)
3439 {
3440 	struct dev_addr_list *da;
3441 
3442 	for (; (da = *list) != NULL; list = &da->next) {
3443 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3444 		    alen == da->da_addrlen) {
3445 			if (glbl) {
3446 				int old_glbl = da->da_gusers;
3447 				da->da_gusers = 0;
3448 				if (old_glbl == 0)
3449 					break;
3450 			}
3451 			if (--da->da_users)
3452 				return 0;
3453 
3454 			*list = da->next;
3455 			kfree(da);
3456 			(*count)--;
3457 			return 0;
3458 		}
3459 	}
3460 	return -ENOENT;
3461 }
3462 
3463 int __dev_addr_add(struct dev_addr_list **list, int *count,
3464 		   void *addr, int alen, int glbl)
3465 {
3466 	struct dev_addr_list *da;
3467 
3468 	for (da = *list; da != NULL; da = da->next) {
3469 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3470 		    da->da_addrlen == alen) {
3471 			if (glbl) {
3472 				int old_glbl = da->da_gusers;
3473 				da->da_gusers = 1;
3474 				if (old_glbl)
3475 					return 0;
3476 			}
3477 			da->da_users++;
3478 			return 0;
3479 		}
3480 	}
3481 
3482 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3483 	if (da == NULL)
3484 		return -ENOMEM;
3485 	memcpy(da->da_addr, addr, alen);
3486 	da->da_addrlen = alen;
3487 	da->da_users = 1;
3488 	da->da_gusers = glbl ? 1 : 0;
3489 	da->next = *list;
3490 	*list = da;
3491 	(*count)++;
3492 	return 0;
3493 }
3494 
3495 /**
3496  *	dev_unicast_delete	- Release secondary unicast address.
3497  *	@dev: device
3498  *	@addr: address to delete
3499  *	@alen: length of @addr
3500  *
3501  *	Release reference to a secondary unicast address and remove it
3502  *	from the device if the reference count drops to zero.
3503  *
3504  * 	The caller must hold the rtnl_mutex.
3505  */
int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
{
	int err;

	ASSERT_RTNL();

	netif_addr_lock_bh(dev);
	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
	/* Re-evaluate the rx filter (may drop uc_promisc) on success. */
	if (!err)
		__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
	return err;
}
3519 EXPORT_SYMBOL(dev_unicast_delete);
3520 
3521 /**
3522  *	dev_unicast_add		- add a secondary unicast address
3523  *	@dev: device
3524  *	@addr: address to add
3525  *	@alen: length of @addr
3526  *
3527  *	Add a secondary unicast address to the device or increase
3528  *	the reference count if it already exists.
3529  *
3530  *	The caller must hold the rtnl_mutex.
3531  */
int dev_unicast_add(struct net_device *dev, void *addr, int alen)
{
	int err;

	ASSERT_RTNL();

	netif_addr_lock_bh(dev);
	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
	/* Re-evaluate the rx filter (may set uc_promisc) on success. */
	if (!err)
		__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
	return err;
}
3545 EXPORT_SYMBOL(dev_unicast_add);
3546 
3547 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3548 		    struct dev_addr_list **from, int *from_count)
3549 {
3550 	struct dev_addr_list *da, *next;
3551 	int err = 0;
3552 
3553 	da = *from;
3554 	while (da != NULL) {
3555 		next = da->next;
3556 		if (!da->da_synced) {
3557 			err = __dev_addr_add(to, to_count,
3558 					     da->da_addr, da->da_addrlen, 0);
3559 			if (err < 0)
3560 				break;
3561 			da->da_synced = 1;
3562 			da->da_users++;
3563 		} else if (da->da_users == 1) {
3564 			__dev_addr_delete(to, to_count,
3565 					  da->da_addr, da->da_addrlen, 0);
3566 			__dev_addr_delete(from, from_count,
3567 					  da->da_addr, da->da_addrlen, 0);
3568 		}
3569 		da = next;
3570 	}
3571 	return err;
3572 }
3573 
3574 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3575 		       struct dev_addr_list **from, int *from_count)
3576 {
3577 	struct dev_addr_list *da, *next;
3578 
3579 	da = *from;
3580 	while (da != NULL) {
3581 		next = da->next;
3582 		if (da->da_synced) {
3583 			__dev_addr_delete(to, to_count,
3584 					  da->da_addr, da->da_addrlen, 0);
3585 			da->da_synced = 0;
3586 			__dev_addr_delete(from, from_count,
3587 					  da->da_addr, da->da_addrlen, 0);
3588 		}
3589 		da = next;
3590 	}
3591 }
3592 
3593 /**
3594  *	dev_unicast_sync - Synchronize device's unicast list to another device
3595  *	@to: destination device
3596  *	@from: source device
3597  *
3598  *	Add newly added addresses to the destination device and release
3599  *	addresses that have no users left. The source device must be
3600  *	locked by netif_tx_lock_bh.
3601  *
3602  *	This function is intended to be called from the dev->set_rx_mode
3603  *	function of layered software devices.
3604  */
int dev_unicast_sync(struct net_device *to, struct net_device *from)
{
	int err = 0;

	/* Only @to is locked here; see the kernel-doc above for the
	 * locking requirement on @from.
	 */
	netif_addr_lock_bh(to);
	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
			      &from->uc_list, &from->uc_count);
	if (!err)
		__dev_set_rx_mode(to);
	netif_addr_unlock_bh(to);
	return err;
}
3617 EXPORT_SYMBOL(dev_unicast_sync);
3618 
3619 /**
3620  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3621  *	@to: destination device
3622  *	@from: source device
3623  *
3624  *	Remove all addresses that were added to the destination device by
3625  *	dev_unicast_sync(). This function is intended to be called from the
3626  *	dev->stop function of layered software devices.
3627  */
void dev_unicast_unsync(struct net_device *to, struct net_device *from)
{
	/* Nest the two address locks (@from first, BHs disabled once) so
	 * both unicast lists stay stable while they are walked.
	 */
	netif_addr_lock_bh(from);
	netif_addr_lock(to);

	__dev_addr_unsync(&to->uc_list, &to->uc_count,
			  &from->uc_list, &from->uc_count);
	__dev_set_rx_mode(to);

	netif_addr_unlock(to);
	netif_addr_unlock_bh(from);
}
3640 EXPORT_SYMBOL(dev_unicast_unsync);
3641 
3642 static void __dev_addr_discard(struct dev_addr_list **list)
3643 {
3644 	struct dev_addr_list *tmp;
3645 
3646 	while (*list != NULL) {
3647 		tmp = *list;
3648 		*list = tmp->next;
3649 		if (tmp->da_users > tmp->da_gusers)
3650 			printk("__dev_addr_discard: address leakage! "
3651 			       "da_users=%d\n", tmp->da_users);
3652 		kfree(tmp);
3653 	}
3654 }
3655 
/* Throw away the device's secondary unicast and multicast address lists
 * and reset their counters, under the address list lock.
 */
static void dev_addr_discard(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_addr_discard(&dev->uc_list);
	dev->uc_count = 0;

	__dev_addr_discard(&dev->mc_list);
	dev->mc_count = 0;

	netif_addr_unlock_bh(dev);
}
3668 
3669 /**
3670  *	dev_get_flags - get flags reported to userspace
3671  *	@dev: device
3672  *
3673  *	Get the combination of flag bits exported through APIs to userspace.
3674  */
3675 unsigned dev_get_flags(const struct net_device *dev)
3676 {
3677 	unsigned flags;
3678 
3679 	flags = (dev->flags & ~(IFF_PROMISC |
3680 				IFF_ALLMULTI |
3681 				IFF_RUNNING |
3682 				IFF_LOWER_UP |
3683 				IFF_DORMANT)) |
3684 		(dev->gflags & (IFF_PROMISC |
3685 				IFF_ALLMULTI));
3686 
3687 	if (netif_running(dev)) {
3688 		if (netif_oper_up(dev))
3689 			flags |= IFF_RUNNING;
3690 		if (netif_carrier_ok(dev))
3691 			flags |= IFF_LOWER_UP;
3692 		if (netif_dormant(dev))
3693 			flags |= IFF_DORMANT;
3694 	}
3695 
3696 	return flags;
3697 }
3698 
3699 /**
3700  *	dev_change_flags - change device settings
3701  *	@dev: device
3702  *	@flags: device state flags
3703  *
3704  *	Change settings on device based state flags. The flags are
3705  *	in the userspace exported format.
3706  */
int dev_change_flags(struct net_device *dev, unsigned flags)
{
	int ret, changes;
	int old_flags = dev->flags;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 */

	/* Take the user-settable bits from @flags and preserve the bits
	 * that are managed by the kernel itself.
	 */
	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);

		if (!ret)
			dev_set_rx_mode(dev);
	}

	if (dev->flags & IFF_UP &&
	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
					  IFF_VOLATILE)))
		call_netdevice_notifiers(NETDEV_CHANGE, dev);

	/* gflags remembers what userspace asked for; translate a change
	 * of request into a promiscuity refcount update.
	 */
	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? +1 : -1;
		dev->gflags ^= IFF_PROMISC;
		dev_set_promiscuity(dev, inc);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
		dev->gflags ^= IFF_ALLMULTI;
		dev_set_allmulti(dev, inc);
	}

	/* Exclude state transition flags, already notified */
	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
	if (changes)
		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);

	return ret;
}
3775 
3776 /**
3777  *	dev_set_mtu - Change maximum transfer unit
3778  *	@dev: device
3779  *	@new_mtu: new transfer unit
3780  *
3781  *	Change the maximum transfer size of the network device.
3782  */
3783 int dev_set_mtu(struct net_device *dev, int new_mtu)
3784 {
3785 	const struct net_device_ops *ops = dev->netdev_ops;
3786 	int err;
3787 
3788 	if (new_mtu == dev->mtu)
3789 		return 0;
3790 
3791 	/*	MTU must be positive.	 */
3792 	if (new_mtu < 0)
3793 		return -EINVAL;
3794 
3795 	if (!netif_device_present(dev))
3796 		return -ENODEV;
3797 
3798 	err = 0;
3799 	if (ops->ndo_change_mtu)
3800 		err = ops->ndo_change_mtu(dev, new_mtu);
3801 	else
3802 		dev->mtu = new_mtu;
3803 
3804 	if (!err && dev->flags & IFF_UP)
3805 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3806 	return err;
3807 }
3808 
3809 /**
3810  *	dev_set_mac_address - Change Media Access Control Address
3811  *	@dev: device
3812  *	@sa: new address
3813  *
3814  *	Change the hardware (MAC) address of the device
3815  */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	/* The address family must match the device's link type. */
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	/* Let interested parties know the hardware address changed. */
	if (!err)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	return err;
}
3832 
3833 /*
3834  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3835  */
/* Handle the read-only SIOCGIFxxx ioctls; caller holds dev_base_lock
 * (see dev_ioctl), which is sufficient because nothing is modified.
 * Results are written back into @ifr.
 */
static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);

	if (!dev)
		return -ENODEV;

	switch (cmd) {
		case SIOCGIFFLAGS:	/* Get interface flags */
			ifr->ifr_flags = dev_get_flags(dev);
			return 0;

		case SIOCGIFMETRIC:	/* Get the metric on the interface
					   (currently unused) */
			ifr->ifr_metric = 0;
			return 0;

		case SIOCGIFMTU:	/* Get the MTU of a device */
			ifr->ifr_mtu = dev->mtu;
			return 0;

		case SIOCGIFHWADDR:
			/* Copy at most sa_data's size; zero-fill when the
			 * device has no hardware address.
			 */
			if (!dev->addr_len)
				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
			else
				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
			ifr->ifr_hwaddr.sa_family = dev->type;
			return 0;

		case SIOCGIFSLAVE:
			err = -EINVAL;
			break;

		case SIOCGIFMAP:
			ifr->ifr_map.mem_start = dev->mem_start;
			ifr->ifr_map.mem_end   = dev->mem_end;
			ifr->ifr_map.base_addr = dev->base_addr;
			ifr->ifr_map.irq       = dev->irq;
			ifr->ifr_map.dma       = dev->dma;
			ifr->ifr_map.port      = dev->if_port;
			return 0;

		case SIOCGIFINDEX:
			ifr->ifr_ifindex = dev->ifindex;
			return 0;

		case SIOCGIFTXQLEN:
			ifr->ifr_qlen = dev->tx_queue_len;
			return 0;

		default:
			/* dev_ioctl() should ensure this case
			 * is never reached
			 */
			WARN_ON(1);
			err = -EINVAL;
			break;

	}
	return err;
}
3899 
3900 /*
3901  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3902  */
/* Handle the state-changing SIOCSIFxxx (and driver/private) ioctls;
 * caller holds the RTNL lock (see dev_ioctl).
 */
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
	const struct net_device_ops *ops;

	if (!dev)
		return -ENODEV;

	ops = dev->netdev_ops;

	switch (cmd) {
		case SIOCSIFFLAGS:	/* Set interface flags */
			return dev_change_flags(dev, ifr->ifr_flags);

		case SIOCSIFMETRIC:	/* Set the metric on the interface
					   (currently unused) */
			return -EOPNOTSUPP;

		case SIOCSIFMTU:	/* Set the MTU of a device */
			return dev_set_mtu(dev, ifr->ifr_mtu);

		case SIOCSIFHWADDR:
			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);

		case SIOCSIFHWBROADCAST:
			if (ifr->ifr_hwaddr.sa_family != dev->type)
				return -EINVAL;
			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
			return 0;

		case SIOCSIFMAP:
			if (ops->ndo_set_config) {
				if (!netif_device_present(dev))
					return -ENODEV;
				return ops->ndo_set_config(dev, &ifr->ifr_map);
			}
			return -EOPNOTSUPP;

		case SIOCADDMULTI:
			/* Only meaningful for devices with a multicast
			 * filter hook; sa_family must be AF_UNSPEC.
			 */
			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
				return -EINVAL;
			if (!netif_device_present(dev))
				return -ENODEV;
			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
					  dev->addr_len, 1);

		case SIOCDELMULTI:
			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
				return -EINVAL;
			if (!netif_device_present(dev))
				return -ENODEV;
			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
					     dev->addr_len, 1);

		case SIOCSIFTXQLEN:
			if (ifr->ifr_qlen < 0)
				return -EINVAL;
			dev->tx_queue_len = ifr->ifr_qlen;
			return 0;

		case SIOCSIFNAME:
			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
			return dev_change_name(dev, ifr->ifr_newname);

		/*
		 *	Unknown or private ioctl
		 */

		default:
			/* Forward whitelisted driver-specific commands to
			 * the driver's ioctl hook when one is provided.
			 */
			if ((cmd >= SIOCDEVPRIVATE &&
			    cmd <= SIOCDEVPRIVATE + 15) ||
			    cmd == SIOCBONDENSLAVE ||
			    cmd == SIOCBONDRELEASE ||
			    cmd == SIOCBONDSETHWADDR ||
			    cmd == SIOCBONDSLAVEINFOQUERY ||
			    cmd == SIOCBONDINFOQUERY ||
			    cmd == SIOCBONDCHANGEACTIVE ||
			    cmd == SIOCGMIIPHY ||
			    cmd == SIOCGMIIREG ||
			    cmd == SIOCSMIIREG ||
			    cmd == SIOCBRADDIF ||
			    cmd == SIOCBRDELIF ||
			    cmd == SIOCSHWTSTAMP ||
			    cmd == SIOCWANDEV) {
				err = -EOPNOTSUPP;
				if (ops->ndo_do_ioctl) {
					if (netif_device_present(dev))
						err = ops->ndo_do_ioctl(dev, ifr, cmd);
					else
						err = -ENODEV;
				}
			} else
				err = -EINVAL;

	}
	return err;
}
4005 
4006 /*
4007  *	This function handles all "interface"-type I/O control requests. The actual
4008  *	'doing' part of this is dev_ifsioc above.
4009  */
4010 
4011 /**
4012  *	dev_ioctl	-	network device ioctl
4013  *	@net: the applicable net namespace
4014  *	@cmd: command to issue
4015  *	@arg: pointer to a struct ifreq in user space
4016  *
4017  *	Issue ioctl functions to devices. This is normally called by the
4018  *	user space syscall interfaces but can sometimes be useful for
4019  *	other purposes. The return value is the return from the syscall if
4020  *	positive or a negative errno code on error.
4021  */
4022 
int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/* Strip an alias suffix ("eth0:1" -> "eth0"); the colon is
	 * restored before copying the result back to userspace.
	 */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
		/*
		 *	These ioctl calls:
		 *	- can be done by all.
		 *	- atomic and do not require locking.
		 *	- return a value
		 */
		case SIOCGIFFLAGS:
		case SIOCGIFMETRIC:
		case SIOCGIFMTU:
		case SIOCGIFHWADDR:
		case SIOCGIFSLAVE:
		case SIOCGIFMAP:
		case SIOCGIFINDEX:
		case SIOCGIFTXQLEN:
			dev_load(net, ifr.ifr_name);
			read_lock(&dev_base_lock);
			ret = dev_ifsioc_locked(net, &ifr, cmd);
			read_unlock(&dev_base_lock);
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		case SIOCETHTOOL:
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ethtool(net, &ifr);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- return a value
		 */
		case SIOCGMIIPHY:
		case SIOCGMIIREG:
		case SIOCSIFNAME:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- do not return a value
		 */
		case SIOCSIFFLAGS:
		case SIOCSIFMETRIC:
		case SIOCSIFMTU:
		case SIOCSIFMAP:
		case SIOCSIFHWADDR:
		case SIOCSIFSLAVE:
		case SIOCADDMULTI:
		case SIOCDELMULTI:
		case SIOCSIFHWBROADCAST:
		case SIOCSIFTXQLEN:
		case SIOCSMIIREG:
		case SIOCBONDENSLAVE:
		case SIOCBONDRELEASE:
		case SIOCBONDSETHWADDR:
		case SIOCBONDCHANGEACTIVE:
		case SIOCBRADDIF:
		case SIOCBRDELIF:
		case SIOCSHWTSTAMP:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			/* fall through */
		case SIOCBONDSLAVEINFOQUERY:
		case SIOCBONDINFOQUERY:
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			return ret;

		case SIOCGIFMEM:
			/* Get the per device memory space. We can add this but
			 * currently do not support it */
		case SIOCSIFMEM:
			/* Set the per device memory buffer space.
			 * Not applicable in our case */
		case SIOCSIFLINK:
			return -EINVAL;

		/*
		 *	Unknown or private ioctl.
		 */
		default:
			if (cmd == SIOCWANDEV ||
			    (cmd >= SIOCDEVPRIVATE &&
			     cmd <= SIOCDEVPRIVATE + 15)) {
				dev_load(net, ifr.ifr_name);
				rtnl_lock();
				ret = dev_ifsioc(net, &ifr, cmd);
				rtnl_unlock();
				if (!ret && copy_to_user(arg, &ifr,
							 sizeof(struct ifreq)))
					ret = -EFAULT;
				return ret;
			}
			/* Take care of Wireless Extensions */
			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
				return wext_handle_ioctl(net, &ifr, cmd, arg);
			return -EINVAL;
	}
}
4188 
4189 
4190 /**
4191  *	dev_new_index	-	allocate an ifindex
4192  *	@net: the applicable net namespace
4193  *
4194  *	Returns a suitable unique value for a new device interface
4195  *	number.  The caller must hold the rtnl semaphore or the
4196  *	dev_base_lock to be sure it remains unique.
4197  */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		/* Wrap back to 1 on overflow; 0 is never handed out. */
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4208 
4209 /* Delayed registration/unregisteration */
4210 static LIST_HEAD(net_todo_list);
4211 
/* Queue @dev on the delayed registration/unregistration list. */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
}
4216 
/* Undo register_netdevice(): close the device, unlink it, notify all
 * interested parties and drop the registration reference.  The teardown
 * steps below are order sensitive.  Caller must hold RTNL.
 */
static void rollback_registered(struct net_device *dev)
{
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);

		WARN_ON(1);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	dev_close(dev);

	/* And unlink it from device chain. */
	unlist_netdevice(dev);

	dev->reg_state = NETREG_UNREGISTERING;

	/* Wait for all packet-processing paths to notice the unlinking. */
	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);


	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	WARN_ON(dev->master);

	/* Remove entries from kobject tree */
	netdev_unregister_kobject(dev);

	synchronize_net();

	dev_put(dev);
}
4270 
/* Initialize one queue's xmit lock, set its lockdep class based on the
 * device type and mark the lock as currently unowned (-1).
 */
static void __netdev_init_queue_locks_one(struct net_device *dev,
					  struct netdev_queue *dev_queue,
					  void *_unused)
{
	spin_lock_init(&dev_queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
	dev_queue->xmit_lock_owner = -1;
}
4279 
/* Initialize the xmit locks of every tx queue plus the rx queue. */
static void netdev_init_queue_locks(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
}
4285 
/* Drop feature bits whose prerequisites are missing (SG needs checksum
 * offload, TSO needs SG, UFO needs both generic checksum and SG) and
 * return the corrected mask.  @name, when non-NULL, is used to log each
 * dropped feature.
 */
unsigned long netdev_fix_features(unsigned long features, const char *name)
{
	/* Fix illegal SG+CSUM combinations. */
	if ((features & NETIF_F_SG) &&
	    !(features & NETIF_F_ALL_CSUM)) {
		if (name)
			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
			       "checksum feature.\n", name);
		features &= ~NETIF_F_SG;
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
		if (name)
			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
			       "SG feature.\n", name);
		features &= ~NETIF_F_TSO;
	}

	if (features & NETIF_F_UFO) {
		/* Both checks below run even if the first one already
		 * cleared NETIF_F_UFO, so both messages can be printed.
		 */
		if (!(features & NETIF_F_GEN_CSUM)) {
			if (name)
				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
				       "since no NETIF_F_HW_CSUM feature.\n",
				       name);
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			if (name)
				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
				       "since no NETIF_F_SG feature.\n", name);
			features &= ~NETIF_F_UFO;
		}
	}

	return features;
}
4324 EXPORT_SYMBOL(netdev_fix_features);
4325 
4326 /* Some devices need to (re-)set their netdev_ops inside
4327  * ->init() or similar.  If that happens, we have to setup
4328  * the compat pointers again.
4329  */
void netdev_resync_ops(struct net_device *dev)
{
#ifdef CONFIG_COMPAT_NET_DEV_OPS
	const struct net_device_ops *ops = dev->netdev_ops;

	/* Mirror each ndo_* hook into the legacy net_device fields so old
	 * call sites keep working while drivers are being converted.
	 */
	dev->init = ops->ndo_init;
	dev->uninit = ops->ndo_uninit;
	dev->open = ops->ndo_open;
	dev->change_rx_flags = ops->ndo_change_rx_flags;
	dev->set_rx_mode = ops->ndo_set_rx_mode;
	dev->set_multicast_list = ops->ndo_set_multicast_list;
	dev->set_mac_address = ops->ndo_set_mac_address;
	dev->validate_addr = ops->ndo_validate_addr;
	dev->do_ioctl = ops->ndo_do_ioctl;
	dev->set_config = ops->ndo_set_config;
	dev->change_mtu = ops->ndo_change_mtu;
	dev->neigh_setup = ops->ndo_neigh_setup;
	dev->tx_timeout = ops->ndo_tx_timeout;
	dev->get_stats = ops->ndo_get_stats;
	dev->vlan_rx_register = ops->ndo_vlan_rx_register;
	dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
	dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
#ifdef CONFIG_NET_POLL_CONTROLLER
	dev->poll_controller = ops->ndo_poll_controller;
#endif
#endif
}
4357 EXPORT_SYMBOL(netdev_resync_ops);
4358 
4359 /**
4360  *	register_netdevice	- register a network device
4361  *	@dev: device to register
4362  *
4363  *	Take a completed network device structure and add it to the kernel
4364  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4365  *	chain. 0 is returned on success. A negative errno code is returned
4366  *	on a failure to set up the device, or if the name is a duplicate.
4367  *
4368  *	Callers must hold the rtnl semaphore. You may want
4369  *	register_netdev() instead of this.
4370  *
4371  *	BUGS:
4372  *	The locking appears insufficient to guarantee two parallel registers
4373  *	will not get the same name.
4374  */
4375 
int register_netdevice(struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *p;
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);
	netdev_init_queue_locks(dev);

	dev->iflink = -1;

#ifdef CONFIG_COMPAT_NET_DEV_OPS
	/* Netdevice_ops API compatibility support.
	 * This is temporary until all network devices are converted.
	 */
	if (dev->netdev_ops) {
		netdev_resync_ops(dev);
	} else {
		char drivername[64];
		pr_info("%s (%s): not using net_device_ops yet\n",
			dev->name, netdev_drivername(dev, drivername, 64));

		/* This works only because net_device_ops and the
		   compatibility structure are the same. */
		dev->netdev_ops = (void *) &(dev->init);
	}
#endif

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* Drivers are expected to return negative errnos;
			 * coerce any stray positive value to -EIO.
			 */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	if (!dev_valid_name(dev->name)) {
		ret = -EINVAL;
		goto err_uninit;
	}

	dev->ifindex = dev_new_index(net);
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Check for existence of name */
	head = dev_name_hash(net, dev->name);
	hlist_for_each(p, head) {
		struct net_device *d
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
			ret = -EEXIST;
			goto err_uninit;
		}
	}

	/* Fix illegal checksum combinations */
	if ((dev->features & NETIF_F_HW_CSUM) &&
	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	if ((dev->features & NETIF_F_NO_CSUM) &&
	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
	}

	dev->features = netdev_fix_features(dev->features, dev->name);

	/* Enable software GSO if SG is supported. */
	if (dev->features & NETIF_F_SG)
		dev->features |= NETIF_F_GSO;

	netdev_initialize_kobject(dev);
	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed the registration: unwind everything
		 * done above and report the error to the caller.
		 */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}

out:
	return ret;

err_uninit:
	/* Undo a successful ->ndo_init() before reporting failure. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
4499 
4500 /**
4501  *	init_dummy_netdev	- init a dummy network device for NAPI
4502  *	@dev: device to init
4503  *
4504  *	This takes a network device structure and initialize the minimum
4505  *	amount of fields so it can be used to schedule NAPI polls without
4506  *	registering a full blown interface. This is to be used by drivers
4507  *	that need to tie several hardware interfaces to a single NAPI
4508  *	poll scheduler due to HW limitations.
4509  */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* initialize the ref count */
	atomic_set(&dev->refcnt, 1);

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Always succeeds; the int return exists for API symmetry. */
	return 0;
}
4536 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4537 
4538 
4539 /**
4540  *	register_netdev	- register a network device
4541  *	@dev: device to register
4542  *
4543  *	Take a completed network device structure and add it to the kernel
4544  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4545  *	chain. 0 is returned on success. A negative errno code is returned
4546  *	on a failure to set up the device, or if the name is a duplicate.
4547  *
4548  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4549  *	and expands the device name if you passed a format string to
4550  *	alloc_netdev.
4551  */
4552 int register_netdev(struct net_device *dev)
4553 {
4554 	int err;
4555 
4556 	rtnl_lock();
4557 
4558 	/*
4559 	 * If the name is a format string the caller wants us to do a
4560 	 * name allocation.
4561 	 */
4562 	if (strchr(dev->name, '%')) {
4563 		err = dev_alloc_name(dev, dev->name);
4564 		if (err < 0)
4565 			goto out;
4566 	}
4567 
4568 	err = register_netdevice(dev);
4569 out:
4570 	rtnl_unlock();
4571 	return err;
4572 }
4573 EXPORT_SYMBOL(register_netdev);
4574 
4575 /*
4576  * netdev_wait_allrefs - wait until all references are gone.
4577  *
4578  * This is called when unregistering network devices.
4579  *
4580  * Any protocol or device that holds a reference should register
4581  * for netdevice notification, and cleanup and put back the
4582  * reference if they receive an UNREGISTER event.
4583  * We can get stuck here if buggy protocols don't correctly
4584  * call dev_put.
4585  */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;

	rebroadcast_time = warning_time = jiffies;
	while (atomic_read(&dev->refcnt) != 0) {
		/* At most once per second, re-send the unregister
		 * notification so laggard reference holders get another
		 * chance to drop their references.
		 */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		/* Complain loudly every 10 seconds while we are stuck. */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			printk(KERN_EMERG "unregister_netdevice: "
			       "waiting for %s to become free. Usage "
			       "count = %d\n",
			       dev->name, atomic_read(&dev->refcnt));
			warning_time = jiffies;
		}
	}
}
4625 
4626 /* The sequence is:
4627  *
4628  *	rtnl_lock();
4629  *	...
4630  *	register_netdevice(x1);
4631  *	register_netdevice(x2);
4632  *	...
4633  *	unregister_netdevice(y1);
4634  *	unregister_netdevice(y2);
4635  *      ...
4636  *	rtnl_unlock();
4637  *	free_netdev(y1);
4638  *	free_netdev(y2);
4639  *
4640  * We are invoked by rtnl_unlock().
4641  * This allows us to deal with problems:
4642  * 1) We can delete sysfs objects which invoke hotplug
4643  *    without deadlocking with linkwatch via keventd.
4644  * 2) Since we run with the RTNL semaphore not held, we can sleep
4645  *    safely in order to wait for the netdev refcnt to drop to zero.
4646  *
4647  * We must not return until all unregister events added during
4648  * the interval the lock was held have been completed.
4649  */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_entry(list.next, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* Only devices mid-unregister belong on the todo list. */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* Run flush_backlog() on every CPU for this device. */
		on_each_cpu(flush_backlog, dev, 1);

		/* Sleep until the last reference to dev is dropped. */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		WARN_ON(dev->ip_ptr);
		WARN_ON(dev->ip6_ptr);
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
4690 
4691 /**
4692  *	dev_get_stats	- get network device statistics
4693  *	@dev: device to get statistics from
4694  *
 *	Get network statistics from device. The device driver may provide
 *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
 *	the internal statistics structure is used.
4698  */
4699 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4700  {
4701 	const struct net_device_ops *ops = dev->netdev_ops;
4702 
4703 	if (ops->ndo_get_stats)
4704 		return ops->ndo_get_stats(dev);
4705 	else
4706 		return &dev->stats;
4707 }
4708 EXPORT_SYMBOL(dev_get_stats);
4709 
/* Point a queue back at its owning device; signature matches the
 * netdev_for_each_tx_queue() callback (the third argument is unused).
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}
4716 
static void netdev_init_queues(struct net_device *dev)
{
	/* Tie the rx queue and every tx queue back to their device. */
	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
}
4723 
4724 /**
4725  *	alloc_netdev_mq - allocate network device
4726  *	@sizeof_priv:	size of private data to allocate space for
4727  *	@name:		device name format string
4728  *	@setup:		callback to initialize device
4729  *	@queue_count:	the number of subqueues to allocate
4730  *
4731  *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
4733  *	for each queue on the device at the end of the netdevice.
4734  */
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *), unsigned int queue_count)
{
	struct netdev_queue *tx;
	struct net_device *dev;
	size_t alloc_size;
	void *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN_CONST;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
		return NULL;
	}

	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
	if (!tx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "tx qdiscs.\n");
		kfree(p);
		return NULL;
	}

	/* Round the net_device up to the alignment boundary inside the
	 * over-sized allocation; dev->padded records the offset so
	 * free_netdev() can recover the original kzalloc pointer.
	 */
	dev = (struct net_device *)
		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
	dev->padded = (char *)dev - (char *)p;
	dev_net_set(dev, &init_net);

	dev->_tx = tx;
	dev->num_tx_queues = queue_count;
	dev->real_num_tx_queues = queue_count;

	dev->gso_max_size = GSO_MAX_SIZE;

	netdev_init_queues(dev);

	INIT_LIST_HEAD(&dev->napi_list);
	/* Let the caller's setup() (e.g. ether_setup) fill in defaults
	 * before the (possibly format-string) name is copied in.
	 */
	setup(dev);
	strcpy(dev->name, name);
	return dev;
}
4786 EXPORT_SYMBOL(alloc_netdev_mq);
4787 
4788 /**
4789  *	free_netdev - free network device
4790  *	@dev: device
4791  *
4792  *	This function does the last stage of destroying an allocated device
4793  * 	interface. The reference to the device object is released.
4794  *	If this is the last reference then it will be freed.
4795  */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);

	/* Remove any NAPI contexts still attached to this device. */
	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	/*  Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		/* Never registered: undo alloc_netdev_mq()'s alignment
		 * padding and free the raw allocation directly.
		 */
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
4819 
4820 /**
4821  *	synchronize_net -  Synchronize with packet receive processing
4822  *
4823  *	Wait for packets currently being received to be done.
4824  *	Does not block later packets from starting.
4825  */
void synchronize_net(void)
{
	/* May block for a full RCU grace period; callers must be in
	 * process context.
	 */
	might_sleep();
	synchronize_rcu();
}
4831 
4832 /**
4833  *	unregister_netdevice - remove device from the kernel
4834  *	@dev: device
4835  *
4836  *	This function shuts down a device interface and removes it
4837  *	from the kernel tables.
4838  *
4839  *	Callers must hold the rtnl semaphore.  You may want
4840  *	unregister_netdev() instead of this.
4841  */
4842 
void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Tear the device down while still under RTNL ... */
	rollback_registered(dev);
	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4851 
4852 /**
4853  *	unregister_netdev - remove device from the kernel
4854  *	@dev: device
4855  *
4856  *	This function shuts down a device interface and removes it
4857  *	from the kernel tables.
4858  *
4859  *	This is just a wrapper for unregister_netdevice that takes
4860  *	the rtnl semaphore.  In general you want to use this and not
4861  *	unregister_netdevice.
4862  */
void unregister_netdev(struct net_device *dev)
{
	/* RTNL-taking wrapper around unregister_netdevice(). */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
4869 
4870 EXPORT_SYMBOL(unregister_netdev);
4871 
4872 /**
 *	dev_change_net_namespace - move device to different network namespace
4874  *	@dev: device
4875  *	@net: network namespace
4876  *	@pat: If not NULL name pattern to try if the current device name
4877  *	      is already taken in the destination network namespace.
4878  *
4879  *	This function shuts down a device interface and moves it
4880  *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
4882  *
4883  *	Callers must hold the rtnl semaphore.
4884  */
4885 
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	char buf[IFNAMSIZ];
	const char *destname;
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

#ifdef CONFIG_SYSFS
	/* Don't allow real devices to be moved when sysfs
	 * is enabled.
	 */
	err = -EINVAL;
	if (dev->dev.parent)
		goto out;
#endif

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	destname = dev->name;
	if (__dev_get_by_name(net, destname)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (!dev_valid_name(pat))
			goto out;
		if (strchr(pat, '%')) {
			if (__dev_alloc_name(net, pat, buf) < 0)
				goto out;
			destname = buf;
		} else
			destname = pat;
		if (__dev_get_by_name(net, destname))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	netdev_unregister_kobject(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* Assign the new device name */
	if (destname != dev->name)
		strcpy(dev->name, destname);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		/* Preserve iflink == ifindex identity across the renumber. */
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = netdev_register_kobject(dev);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	synchronize_net();
	err = 0;
out:
	return err;
}
4997 
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct Qdisc **list_net;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	/* Only CPUs that have gone away need their queues migrated. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Find end of our output_queue. */
	list_net = &sd->output_queue;
	while (*list_net)
		list_net = &(*list_net)->next_sched;
	/* Append output queue from offline CPU. */
	*list_net = oldsd->output_queue;
	oldsd->output_queue = NULL;

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
5041 
5042 
5043 /**
5044  *	netdev_increment_features - increment feature set by one
5045  *	@all: current feature set
5046  *	@one: new feature set
5047  *	@mask: mask feature set
5048  *
5049  *	Computes a new feature set after adding a device with feature set
5050  *	@one to the master device with current feature set @all.  Will not
5051  *	enable anything that is off in @mask. Returns the new feature set.
5052  */
5053 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5054 					unsigned long mask)
5055 {
5056 	/* If device needs checksumming, downgrade to it. */
5057         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5058 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5059 	else if (mask & NETIF_F_ALL_CSUM) {
5060 		/* If one device supports v4/v6 checksumming, set for all. */
5061 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5062 		    !(all & NETIF_F_GEN_CSUM)) {
5063 			all &= ~NETIF_F_ALL_CSUM;
5064 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5065 		}
5066 
5067 		/* If one device supports hw checksumming, set for all. */
5068 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5069 			all &= ~NETIF_F_ALL_CSUM;
5070 			all |= NETIF_F_HW_CSUM;
5071 		}
5072 	}
5073 
5074 	one |= NETIF_F_ALL_CSUM;
5075 
5076 	one |= all & NETIF_F_ONE_FOR_ALL;
5077 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5078 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5079 
5080 	return all;
5081 }
5082 EXPORT_SYMBOL(netdev_increment_features);
5083 
5084 static struct hlist_head *netdev_create_hash(void)
5085 {
5086 	int i;
5087 	struct hlist_head *hash;
5088 
5089 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5090 	if (hash != NULL)
5091 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5092 			INIT_HLIST_HEAD(&hash[i]);
5093 
5094 	return hash;
5095 }
5096 
5097 /* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	/* Per-namespace hash tables for device lookup by name ... */
	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	/* ... and by ifindex. */
	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
5117 
5118 /**
5119  *	netdev_drivername - network driver for the device
5120  *	@dev: network device
5121  *	@buffer: buffer for resulting name
5122  *	@len: size of buffer
5123  *
5124  *	Determine network driver for device.
5125  */
5126 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5127 {
5128 	const struct device_driver *driver;
5129 	const struct device *parent;
5130 
5131 	if (len <= 0 || !buffer)
5132 		return buffer;
5133 	buffer[0] = 0;
5134 
5135 	parent = dev->dev.parent;
5136 
5137 	if (!parent)
5138 		return buffer;
5139 
5140 	driver = parent->driver;
5141 	if (driver && driver->name)
5142 		strlcpy(buffer, driver->name, len);
5143 	return buffer;
5144 }
5145 
static void __net_exit netdev_exit(struct net *net)
{
	/* Free the name/ifindex hash tables allocated in netdev_init(). */
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
5151 
/* Hook netdev state setup/teardown into network namespace lifetime. */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
5156 
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
restart:
	for_each_netdev(net, dev) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Delete virtual devices */
		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
			dev->rtnl_link_ops->dellink(dev);
			/* The device list was modified under us: rescan. */
			goto restart;
		}

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			BUG();
		}
		goto restart;
	}
	rtnl_unlock();
}
5192 
/* Only teardown work is needed; there is deliberately no .init. */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
};
5196 
5197 /*
5198  *	Initialize the DEV module. At boot time this walks the device list and
5199  *	unhooks any devices that fail to initialise (normally hardware not
5200  *	present) and leaves us with a valid list of present and active devices.
5201  *
5202  */
5203 
5204 /*
5205  *       This is called single threaded during boot, so no need
5206  *       to take the rtnl semaphore.
5207  */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Packet-type dispatch: one catch-all list plus a hash table. */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		queue->completion_queue = NULL;
		INIT_LIST_HEAD(&queue->poll_list);

		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
		queue->backlog.gro_list = NULL;
		queue->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}
5272 
5273 subsys_initcall(net_dev_init);
5274 
/* Seed the transmit hash; registered via late_initcall_sync below. */
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
	return 0;
}
5280 
5281 late_initcall_sync(initialize_hashrnd);
5282 
5283 EXPORT_SYMBOL(__dev_get_by_index);
5284 EXPORT_SYMBOL(__dev_get_by_name);
5285 EXPORT_SYMBOL(__dev_remove_pack);
5286 EXPORT_SYMBOL(dev_valid_name);
5287 EXPORT_SYMBOL(dev_add_pack);
5288 EXPORT_SYMBOL(dev_alloc_name);
5289 EXPORT_SYMBOL(dev_close);
5290 EXPORT_SYMBOL(dev_get_by_flags);
5291 EXPORT_SYMBOL(dev_get_by_index);
5292 EXPORT_SYMBOL(dev_get_by_name);
5293 EXPORT_SYMBOL(dev_open);
5294 EXPORT_SYMBOL(dev_queue_xmit);
5295 EXPORT_SYMBOL(dev_remove_pack);
5296 EXPORT_SYMBOL(dev_set_allmulti);
5297 EXPORT_SYMBOL(dev_set_promiscuity);
5298 EXPORT_SYMBOL(dev_change_flags);
5299 EXPORT_SYMBOL(dev_set_mtu);
5300 EXPORT_SYMBOL(dev_set_mac_address);
5301 EXPORT_SYMBOL(free_netdev);
5302 EXPORT_SYMBOL(netdev_boot_setup_check);
5303 EXPORT_SYMBOL(netdev_set_master);
5304 EXPORT_SYMBOL(netdev_state_change);
5305 EXPORT_SYMBOL(netif_receive_skb);
5306 EXPORT_SYMBOL(netif_rx);
5307 EXPORT_SYMBOL(register_gifconf);
5308 EXPORT_SYMBOL(register_netdevice);
5309 EXPORT_SYMBOL(register_netdevice_notifier);
5310 EXPORT_SYMBOL(skb_checksum_help);
5311 EXPORT_SYMBOL(synchronize_net);
5312 EXPORT_SYMBOL(unregister_netdevice);
5313 EXPORT_SYMBOL(unregister_netdevice_notifier);
5314 EXPORT_SYMBOL(net_enable_timestamp);
5315 EXPORT_SYMBOL(net_disable_timestamp);
5316 EXPORT_SYMBOL(dev_get_flags);
5317 
5318 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5319 EXPORT_SYMBOL(br_handle_frame_hook);
5320 EXPORT_SYMBOL(br_fdb_get_hook);
5321 EXPORT_SYMBOL(br_fdb_put_hook);
5322 #endif
5323 
5324 EXPORT_SYMBOL(dev_load);
5325 
5326 EXPORT_PER_CPU_SYMBOL(softnet_data);
5327