xref: /linux/net/core/dev.c (revision b233b28eac0cc37d07c2d007ea08c86c778c5af4)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16?  Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
168 
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * See, for example usages, register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
195 
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
212 {
213 	struct net *net = dev_net(dev);
214 
215 	ASSERT_RTNL();
216 
217 	write_lock_bh(&dev_base_lock);
218 	list_add_tail(&dev->dev_list, &net->dev_base_head);
219 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
220 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 	ASSERT_RTNL();
229 
230 	/* Unlink dev from the device chain */
231 	write_lock_bh(&dev_base_lock);
232 	list_del(&dev->dev_list);
233 	hlist_del(&dev->name_hlist);
234 	hlist_del(&dev->index_hlist);
235 	write_unlock_bh(&dev_base_lock);
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 }
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 }
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers that mangle input packets
345  *	MUST BE last in the hash buckets, and checking of protocol handlers
346  *	MUST start from the promiscuous ptype_all chain in net_bh.
347  *	This is true now; do not change it.
348  *	Explanation: if a packet-mangling protocol handler were
349  *	first on the list, it could not tell that the packet
350  *	is cloned and should be copied-on-write, so it would
351  *	modify it in place and subsequent readers would see a broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep, therefore it cannot
364  *	guarantee that all CPUs currently in the middle of receiving
365  *	packets will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
381 
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPUs have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432 	__dev_remove_pack(pt);
433 
434 	synchronize_net();
435 }
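/*
 * Illustrative usage sketch (not part of this file): a minimal protocol
 * handler registration and removal.  The names my_rcv and my_ptype are
 * hypothetical; dev_remove_pack() may sleep, as noted above.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype = {
 *		.type	= cpu_to_be16(ETH_P_IP),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);
 */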
436 
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds a new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine for
453  *	all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot-time settings for the device.
478  *	Any settings found are stored on the device for use
479  *	later during device probing.
480  *	Returns 0 if no settings are found, 1 if they are.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot-time settings for the base address of the device.
507  *	Any settings found are stored on the device for use
508  *	later during device probing.
509  *	Returns 0 if no settings are found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If device already registered then return base of 1
521 	 * to indicate not to probe for this interface
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves settings configured at boot time for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
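/*
 * For example (values purely illustrative), booting with
 *
 *	netdev=9,0x300,eth0
 *
 * records irq=9 and base_addr=0x300 for the device named "eth0"; a driver
 * later picks these up via netdev_boot_setup_check().
 */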
560 
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
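/*
 * Typical caller pattern (sketch only; "eth0" is just an example name):
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */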
615 
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns a pointer to the device,
622  *	or %NULL if the device is not found. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns a pointer to the device,
648  *	or NULL if the device is not found. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns a pointer to the
672  *	device, or NULL if the device is not found. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count increased
674  *	and the caller must therefore be careful about locking.
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *
730  *	Search for any interface with the given flags. Returns a pointer to
731  *	the device, or NULL if no matching device is found. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
777 
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" - it will try to find a suitable
785  *	id. It scans the list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
794 {
795 	int i = 0;
796 	const char *p;
797 	const int max_netdevices = 8*PAGE_SIZE;
798 	unsigned long *inuse;
799 	struct net_device *d;
800 
801 	p = strnchr(name, IFNAMSIZ-1, '%');
802 	if (p) {
803 		/*
804 		 * Verify the string as this thing may have come from
805 		 * the user.  There must be either one "%d" and no other "%"
806 		 * characters.
807 		 */
808 		if (p[1] != 'd' || strchr(p + 2, '%'))
809 			return -EINVAL;
810 
811 		/* Use one page as a bit array of possible slots */
812 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
813 		if (!inuse)
814 			return -ENOMEM;
815 
816 		for_each_netdev(net, d) {
817 			if (!sscanf(d->name, name, &i))
818 				continue;
819 			if (i < 0 || i >= max_netdevices)
820 				continue;
821 
822 			/*  avoid cases where sscanf is not exact inverse of printf */
823 			snprintf(buf, IFNAMSIZ, name, i);
824 			if (!strncmp(buf, d->name, IFNAMSIZ))
825 				set_bit(i, inuse);
826 		}
827 
828 		i = find_first_zero_bit(inuse, max_netdevices);
829 		free_page((unsigned long) inuse);
830 	}
831 
832 	snprintf(buf, IFNAMSIZ, name, i);
833 	if (!__dev_get_by_name(net, buf))
834 		return i;
835 
836 	/* It is possible to run out of possible slots
837 	 * when the name is long and there isn't enough space left
838 	 * for the digits, or if all bits are used.
839 	 */
840 	return -ENFILE;
841 }
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" - it will try to find a suitable
849  *	id. It scans the list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
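/*
 * Illustrative use: a driver that wants conventional ethN naming can do
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success the return value is the unit number that was assigned and
 * dev->name has been filled in (e.g. "eth3").
 */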
870 
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *
877  *	Change the name of a device. Format strings such as "eth%d"
878  *	can be passed for wildcarding.
879  */
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	}
907 	else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from info
952  *
953  *	Set ifalias for a device,
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
985 void netdev_features_change(struct net_device *dev)
986 {
987 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
988 }
989 EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Enable NET_DMA
1092 		 */
1093 		net_dmaengine_get();
1094 
1095 		/*
1096 		 *	Initialize multicasting status
1097 		 */
1098 		dev_set_rx_mode(dev);
1099 
1100 		/*
1101 		 *	Wakeup transmit queue engine
1102 		 */
1103 		dev_activate(dev);
1104 
1105 		/*
1106 		 *	... and announce new interface.
1107 		 */
1108 		call_netdevice_notifiers(NETDEV_UP, dev);
1109 	}
1110 
1111 	return ret;
1112 }
1113 
1114 /**
1115  *	dev_close - shutdown an interface.
1116  *	@dev: device to shutdown
1117  *
1118  *	This function moves an active device into down state. A
1119  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1120  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1121  *	chain.
1122  */
1123 int dev_close(struct net_device *dev)
1124 {
1125 	const struct net_device_ops *ops = dev->netdev_ops;
1126 	ASSERT_RTNL();
1127 
1128 	might_sleep();
1129 
1130 	if (!(dev->flags & IFF_UP))
1131 		return 0;
1132 
1133 	/*
1134 	 *	Tell people we are going down, so that they can
1135 	 *	prepare for it while the device is still operating.
1136 	 */
1137 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1138 
1139 	clear_bit(__LINK_STATE_START, &dev->state);
1140 
1141 	/* Synchronize with any scheduled poll. We cannot touch the poll list;
1142 	 * it may even be on a different CPU. So just clear netif_running().
1143 	 *
1144 	 * dev->stop() will invoke napi_disable() on all of its
1145 	 * napi_struct instances on this device.
1146 	 */
1147 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1148 
1149 	dev_deactivate(dev);
1150 
1151 	/*
1152 	 *	Call the device-specific close. This cannot fail, and is
1153 	 *	done only if the device is UP.
1154 	 *
1155 	 *	We allow it to be called even after a DETACH hot-plug
1156 	 *	event.
1157 	 */
1158 	if (ops->ndo_stop)
1159 		ops->ndo_stop(dev);
1160 
1161 	/*
1162 	 *	Device is now down.
1163 	 */
1164 
1165 	dev->flags &= ~IFF_UP;
1166 
1167 	/*
1168 	 * Tell people we are down
1169 	 */
1170 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1171 
1172 	/*
1173 	 *	Shutdown NET_DMA
1174 	 */
1175 	net_dmaengine_put();
1176 
1177 	return 0;
1178 }
1179 
1180 
1181 /**
1182  *	dev_disable_lro - disable Large Receive Offload on a device
1183  *	@dev: device
1184  *
1185  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186  *	called under RTNL.  This is needed if received packets may be
1187  *	forwarded to another interface.
1188  */
1189 void dev_disable_lro(struct net_device *dev)
1190 {
1191 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192 	    dev->ethtool_ops->set_flags) {
1193 		u32 flags = dev->ethtool_ops->get_flags(dev);
1194 		if (flags & ETH_FLAG_LRO) {
1195 			flags &= ~ETH_FLAG_LRO;
1196 			dev->ethtool_ops->set_flags(dev, flags);
1197 		}
1198 	}
1199 	WARN_ON(dev->features & NETIF_F_LRO);
1200 }
1201 EXPORT_SYMBOL(dev_disable_lro);
1202 
1203 
1204 static int dev_boot_phase = 1;
1205 
1206 /*
1207  *	Device change register/unregister. These are not inline or static
1208  *	as we export them to the world.
1209  */
1210 
1211 /**
1212  *	register_netdevice_notifier - register a network notifier block
1213  *	@nb: notifier
1214  *
1215  *	Register a notifier to be called when network device events occur.
1216  *	The notifier passed is linked into the kernel structures and must
1217  *	not be reused until it has been unregistered. A negative errno code
1218  *	is returned on a failure.
1219  *
1220  * 	When registered, all registration and up events are replayed
1221  *	to the new notifier to allow it to have a race-free
1222  *	view of the network device list.
1223  */
1224 
1225 int register_netdevice_notifier(struct notifier_block *nb)
1226 {
1227 	struct net_device *dev;
1228 	struct net_device *last;
1229 	struct net *net;
1230 	int err;
1231 
1232 	rtnl_lock();
1233 	err = raw_notifier_chain_register(&netdev_chain, nb);
1234 	if (err)
1235 		goto unlock;
1236 	if (dev_boot_phase)
1237 		goto unlock;
1238 	for_each_net(net) {
1239 		for_each_netdev(net, dev) {
1240 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241 			err = notifier_to_errno(err);
1242 			if (err)
1243 				goto rollback;
1244 
1245 			if (!(dev->flags & IFF_UP))
1246 				continue;
1247 
1248 			nb->notifier_call(nb, NETDEV_UP, dev);
1249 		}
1250 	}
1251 
1252 unlock:
1253 	rtnl_unlock();
1254 	return err;
1255 
1256 rollback:
1257 	last = dev;
1258 	for_each_net(net) {
1259 		for_each_netdev(net, dev) {
1260 			if (dev == last)
1261 				break;
1262 
1263 			if (dev->flags & IFF_UP) {
1264 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266 			}
1267 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268 		}
1269 	}
1270 
1271 	raw_notifier_chain_unregister(&netdev_chain, nb);
1272 	goto unlock;
1273 }
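/*
 * Illustrative notifier usage (sketch; my_netdev_event and
 * my_netdev_notifier are hypothetical names).  On this kernel the
 * notifier's ptr argument is the struct net_device itself:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_DOWN:
 *			printk(KERN_INFO "%s is down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_notifier = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_notifier);
 */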
1274 
1275 /**
1276  *	unregister_netdevice_notifier - unregister a network notifier block
1277  *	@nb: notifier
1278  *
1279  *	Unregister a notifier previously registered by
1280  *	register_netdevice_notifier(). The notifier is unlinked from the
1281  *	kernel structures and may then be reused. A negative errno code
1282  *	is returned on a failure.
1283  */
1284 
1285 int unregister_netdevice_notifier(struct notifier_block *nb)
1286 {
1287 	int err;
1288 
1289 	rtnl_lock();
1290 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1291 	rtnl_unlock();
1292 	return err;
1293 }
1294 
1295 /**
1296  *	call_netdevice_notifiers - call all network notifier blocks
1297  *      @val: value passed unmodified to notifier function
1298  *      @dev: net_device pointer passed unmodified to notifier function
1299  *
1300  *	Call all network notifier blocks.  Parameters and return value
1301  *	are as for raw_notifier_call_chain().
1302  */
1303 
1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1305 {
1306 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1307 }
1308 
1309 /* When > 0 there are consumers of rx skb time stamps */
1310 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311 
1312 void net_enable_timestamp(void)
1313 {
1314 	atomic_inc(&netstamp_needed);
1315 }
1316 
1317 void net_disable_timestamp(void)
1318 {
1319 	atomic_dec(&netstamp_needed);
1320 }
1321 
1322 static inline void net_timestamp(struct sk_buff *skb)
1323 {
1324 	if (atomic_read(&netstamp_needed))
1325 		__net_timestamp(skb);
1326 	else
1327 		skb->tstamp.tv64 = 0;
1328 }
1329 
1330 /*
1331  *	Support routine. Sends outgoing frames to any network
1332  *	taps currently in use.
1333  */
1334 
1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336 {
1337 	struct packet_type *ptype;
1338 
1339 	net_timestamp(skb);
1340 
1341 	rcu_read_lock();
1342 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343 		/* Never send packets back to the socket
1344 		 * they originated from - MvS (miquels@drinkel.ow.org)
1345 		 */
1346 		if ((ptype->dev == dev || !ptype->dev) &&
1347 		    (ptype->af_packet_priv == NULL ||
1348 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1350 			if (!skb2)
1351 				break;
1352 
1353 			/* The network header should be correctly
1354 			   set by the sender, so the check below is
1355 			   just protection against buggy protocols.
1356 			 */
1357 			skb_reset_mac_header(skb2);
1358 
1359 			if (skb_network_header(skb2) < skb2->data ||
1360 			    skb2->network_header > skb2->tail) {
1361 				if (net_ratelimit())
1362 					printk(KERN_CRIT "protocol %04x is "
1363 					       "buggy, dev %s\n",
1364 					       ntohs(skb2->protocol), dev->name);
1365 				skb_reset_network_header(skb2);
1366 			}
1367 
1368 			skb2->transport_header = skb2->network_header;
1369 			skb2->pkt_type = PACKET_OUTGOING;
1370 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371 		}
1372 	}
1373 	rcu_read_unlock();
1374 }
1375 
1376 
1377 static inline void __netif_reschedule(struct Qdisc *q)
1378 {
1379 	struct softnet_data *sd;
1380 	unsigned long flags;
1381 
1382 	local_irq_save(flags);
1383 	sd = &__get_cpu_var(softnet_data);
1384 	q->next_sched = sd->output_queue;
1385 	sd->output_queue = q;
1386 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1387 	local_irq_restore(flags);
1388 }
1389 
1390 void __netif_schedule(struct Qdisc *q)
1391 {
1392 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1393 		__netif_reschedule(q);
1394 }
1395 EXPORT_SYMBOL(__netif_schedule);
1396 
1397 void dev_kfree_skb_irq(struct sk_buff *skb)
1398 {
1399 	if (atomic_dec_and_test(&skb->users)) {
1400 		struct softnet_data *sd;
1401 		unsigned long flags;
1402 
1403 		local_irq_save(flags);
1404 		sd = &__get_cpu_var(softnet_data);
1405 		skb->next = sd->completion_queue;
1406 		sd->completion_queue = skb;
1407 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1408 		local_irq_restore(flags);
1409 	}
1410 }
1411 EXPORT_SYMBOL(dev_kfree_skb_irq);
1412 
1413 void dev_kfree_skb_any(struct sk_buff *skb)
1414 {
1415 	if (in_irq() || irqs_disabled())
1416 		dev_kfree_skb_irq(skb);
1417 	else
1418 		dev_kfree_skb(skb);
1419 }
1420 EXPORT_SYMBOL(dev_kfree_skb_any);
1421 
1422 
1423 /**
1424  * netif_device_detach - mark device as removed
1425  * @dev: network device
1426  *
1427  * Mark device as removed from system and therefore no longer available.
1428  */
1429 void netif_device_detach(struct net_device *dev)
1430 {
1431 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432 	    netif_running(dev)) {
1433 		netif_stop_queue(dev);
1434 	}
1435 }
1436 EXPORT_SYMBOL(netif_device_detach);
1437 
1438 /**
1439  * netif_device_attach - mark device as attached
1440  * @dev: network device
1441  *
1442  * Mark device as attached from system and restart if needed.
1443  */
1444 void netif_device_attach(struct net_device *dev)
1445 {
1446 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447 	    netif_running(dev)) {
1448 		netif_wake_queue(dev);
1449 		__netdev_watchdog_up(dev);
1450 	}
1451 }
1452 EXPORT_SYMBOL(netif_device_attach);
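/*
 * Typical use is in a driver's suspend/resume path (sketch only; the
 * my_suspend/my_resume names and the PCI driver model are assumptions):
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		... stop the hardware ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		... restart the hardware ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */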
1453 
1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455 {
1456 	return ((features & NETIF_F_GEN_CSUM) ||
1457 		((features & NETIF_F_IP_CSUM) &&
1458 		 protocol == htons(ETH_P_IP)) ||
1459 		((features & NETIF_F_IPV6_CSUM) &&
1460 		 protocol == htons(ETH_P_IPV6)));
1461 }
1462 
1463 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1464 {
1465 	if (can_checksum_protocol(dev->features, skb->protocol))
1466 		return true;
1467 
1468 	if (skb->protocol == htons(ETH_P_8021Q)) {
1469 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1470 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1471 					  veh->h_vlan_encapsulated_proto))
1472 			return true;
1473 	}
1474 
1475 	return false;
1476 }
1477 
1478 /*
1479  * Invalidate hardware checksum when packet is to be mangled, and
1480  * complete checksum manually on outgoing path.
1481  */
1482 int skb_checksum_help(struct sk_buff *skb)
1483 {
1484 	__wsum csum;
1485 	int ret = 0, offset;
1486 
1487 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1488 		goto out_set_summed;
1489 
1490 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1491 		/* Let GSO fix up the checksum. */
1492 		goto out_set_summed;
1493 	}
1494 
1495 	offset = skb->csum_start - skb_headroom(skb);
1496 	BUG_ON(offset >= skb_headlen(skb));
1497 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1498 
1499 	offset += skb->csum_offset;
1500 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1501 
1502 	if (skb_cloned(skb) &&
1503 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1504 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1505 		if (ret)
1506 			goto out;
1507 	}
1508 
1509 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1510 out_set_summed:
1511 	skb->ip_summed = CHECKSUM_NONE;
1512 out:
1513 	return ret;
1514 }
1515 
1516 /**
1517  *	skb_gso_segment - Perform segmentation on skb.
1518  *	@skb: buffer to segment
1519  *	@features: features for the output path (see dev->features)
1520  *
1521  *	This function segments the given skb and returns a list of segments.
1522  *
1523  *	It may return NULL if the skb requires no segmentation.  This is
1524  *	only possible when GSO is used for verifying header integrity.
1525  */
1526 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1527 {
1528 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1529 	struct packet_type *ptype;
1530 	__be16 type = skb->protocol;
1531 	int err;
1532 
1533 	skb_reset_mac_header(skb);
1534 	skb->mac_len = skb->network_header - skb->mac_header;
1535 	__skb_pull(skb, skb->mac_len);
1536 
1537 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1538 		struct net_device *dev = skb->dev;
1539 		struct ethtool_drvinfo info = {};
1540 
1541 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1542 			dev->ethtool_ops->get_drvinfo(dev, &info);
1543 
1544 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1545 			"ip_summed=%d",
1546 		     info.driver, dev ? dev->features : 0L,
1547 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1548 		     skb->len, skb->data_len, skb->ip_summed);
1549 
1550 		if (skb_header_cloned(skb) &&
1551 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1552 			return ERR_PTR(err);
1553 	}
1554 
1555 	rcu_read_lock();
1556 	list_for_each_entry_rcu(ptype,
1557 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1558 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1559 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1560 				err = ptype->gso_send_check(skb);
1561 				segs = ERR_PTR(err);
1562 				if (err || skb_gso_ok(skb, features))
1563 					break;
1564 				__skb_push(skb, (skb->data -
1565 						 skb_network_header(skb)));
1566 			}
1567 			segs = ptype->gso_segment(skb, features);
1568 			break;
1569 		}
1570 	}
1571 	rcu_read_unlock();
1572 
1573 	__skb_push(skb, skb->data - skb_mac_header(skb));
1574 
1575 	return segs;
1576 }
1577 
1578 EXPORT_SYMBOL(skb_gso_segment);
1579 
1580 /* Take action when hardware reception checksum errors are detected. */
1581 #ifdef CONFIG_BUG
1582 void netdev_rx_csum_fault(struct net_device *dev)
1583 {
1584 	if (net_ratelimit()) {
1585 		printk(KERN_ERR "%s: hw csum failure.\n",
1586 			dev ? dev->name : "<unknown>");
1587 		dump_stack();
1588 	}
1589 }
1590 EXPORT_SYMBOL(netdev_rx_csum_fault);
1591 #endif
1592 
1593 /* Actually, we should eliminate this check as soon as we know that:
1594  * 1. An IOMMU is present and allows us to map all the memory.
1595  * 2. No high memory really exists on this machine.
1596  */
1597 
1598 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1599 {
1600 #ifdef CONFIG_HIGHMEM
1601 	int i;
1602 
1603 	if (dev->features & NETIF_F_HIGHDMA)
1604 		return 0;
1605 
1606 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1607 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1608 			return 1;
1609 
1610 #endif
1611 	return 0;
1612 }
1613 
1614 struct dev_gso_cb {
1615 	void (*destructor)(struct sk_buff *skb);
1616 };
1617 
1618 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1619 
1620 static void dev_gso_skb_destructor(struct sk_buff *skb)
1621 {
1622 	struct dev_gso_cb *cb;
1623 
1624 	do {
1625 		struct sk_buff *nskb = skb->next;
1626 
1627 		skb->next = nskb->next;
1628 		nskb->next = NULL;
1629 		kfree_skb(nskb);
1630 	} while (skb->next);
1631 
1632 	cb = DEV_GSO_CB(skb);
1633 	if (cb->destructor)
1634 		cb->destructor(skb);
1635 }
1636 
1637 /**
1638  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1639  *	@skb: buffer to segment
1640  *
1641  *	This function segments the given skb and stores the list of segments
1642  *	in skb->next.
1643  */
1644 static int dev_gso_segment(struct sk_buff *skb)
1645 {
1646 	struct net_device *dev = skb->dev;
1647 	struct sk_buff *segs;
1648 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1649 					 NETIF_F_SG : 0);
1650 
1651 	segs = skb_gso_segment(skb, features);
1652 
1653 	/* Verifying header integrity only. */
1654 	if (!segs)
1655 		return 0;
1656 
1657 	if (IS_ERR(segs))
1658 		return PTR_ERR(segs);
1659 
1660 	skb->next = segs;
1661 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1662 	skb->destructor = dev_gso_skb_destructor;
1663 
1664 	return 0;
1665 }
1666 
1667 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1668 			struct netdev_queue *txq)
1669 {
1670 	const struct net_device_ops *ops = dev->netdev_ops;
1671 
1672 	prefetch(&dev->netdev_ops->ndo_start_xmit);
1673 	if (likely(!skb->next)) {
1674 		if (!list_empty(&ptype_all))
1675 			dev_queue_xmit_nit(skb, dev);
1676 
1677 		if (netif_needs_gso(dev, skb)) {
1678 			if (unlikely(dev_gso_segment(skb)))
1679 				goto out_kfree_skb;
1680 			if (skb->next)
1681 				goto gso;
1682 		}
1683 
1684 		return ops->ndo_start_xmit(skb, dev);
1685 	}
1686 
1687 gso:
1688 	do {
1689 		struct sk_buff *nskb = skb->next;
1690 		int rc;
1691 
1692 		skb->next = nskb->next;
1693 		nskb->next = NULL;
1694 		rc = ops->ndo_start_xmit(nskb, dev);
1695 		if (unlikely(rc)) {
1696 			nskb->next = skb->next;
1697 			skb->next = nskb;
1698 			return rc;
1699 		}
1700 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1701 			return NETDEV_TX_BUSY;
1702 	} while (skb->next);
1703 
1704 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1705 
1706 out_kfree_skb:
1707 	kfree_skb(skb);
1708 	return 0;
1709 }
1710 
1711 static u32 simple_tx_hashrnd;
1712 static int simple_tx_hashrnd_initialized = 0;
1713 
1714 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1715 {
1716 	u32 addr1, addr2, ports;
1717 	u32 hash, ihl;
1718 	u8 ip_proto = 0;
1719 
1720 	if (unlikely(!simple_tx_hashrnd_initialized)) {
1721 		get_random_bytes(&simple_tx_hashrnd, 4);
1722 		simple_tx_hashrnd_initialized = 1;
1723 	}
1724 
1725 	switch (skb->protocol) {
1726 	case htons(ETH_P_IP):
1727 		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1728 			ip_proto = ip_hdr(skb)->protocol;
1729 		addr1 = ip_hdr(skb)->saddr;
1730 		addr2 = ip_hdr(skb)->daddr;
1731 		ihl = ip_hdr(skb)->ihl;
1732 		break;
1733 	case htons(ETH_P_IPV6):
1734 		ip_proto = ipv6_hdr(skb)->nexthdr;
1735 		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1736 		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1737 		ihl = (40 >> 2);
1738 		break;
1739 	default:
1740 		return 0;
1741 	}
1742 
1743 
1744 	switch (ip_proto) {
1745 	case IPPROTO_TCP:
1746 	case IPPROTO_UDP:
1747 	case IPPROTO_DCCP:
1748 	case IPPROTO_ESP:
1749 	case IPPROTO_AH:
1750 	case IPPROTO_SCTP:
1751 	case IPPROTO_UDPLITE:
1752 		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1753 		break;
1754 
1755 	default:
1756 		ports = 0;
1757 		break;
1758 	}
1759 
1760 	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1761 
1762 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1763 }
1764 
1765 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1766 					struct sk_buff *skb)
1767 {
1768 	const struct net_device_ops *ops = dev->netdev_ops;
1769 	u16 queue_index = 0;
1770 
1771 	if (ops->ndo_select_queue)
1772 		queue_index = ops->ndo_select_queue(dev, skb);
1773 	else if (dev->real_num_tx_queues > 1)
1774 		queue_index = simple_tx_hash(dev, skb);
1775 
1776 	skb_set_queue_mapping(skb, queue_index);
1777 	return netdev_get_tx_queue(dev, queue_index);
1778 }
1779 
1780 /**
1781  *	dev_queue_xmit - transmit a buffer
1782  *	@skb: buffer to transmit
1783  *
1784  *	Queue a buffer for transmission to a network device. The caller must
1785  *	have set the device and priority and built the buffer before calling
1786  *	this function. The function can be called from an interrupt.
1787  *
1788  *	A negative errno code is returned on a failure. A success does not
1789  *	guarantee the frame will be transmitted as it may be dropped due
1790  *	to congestion or traffic shaping.
1791  *
1792  * -----------------------------------------------------------------------------------
1793  *      I notice this method can also return errors from the queue disciplines,
1794  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1795  *      be positive.
1796  *
1797  *      Regardless of the return value, the skb is consumed, so it is currently
1798  *      difficult to retry a send to this method.  (You can bump the ref count
1799  *      before sending to hold a reference for retry if you are careful.)
1800  *
1801  *      When calling this method, interrupts MUST be enabled.  This is because
1802  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1803  *          --BLG
1804  */
1805 int dev_queue_xmit(struct sk_buff *skb)
1806 {
1807 	struct net_device *dev = skb->dev;
1808 	struct netdev_queue *txq;
1809 	struct Qdisc *q;
1810 	int rc = -ENOMEM;
1811 
1812 	/* GSO will handle the following emulations directly. */
1813 	if (netif_needs_gso(dev, skb))
1814 		goto gso;
1815 
1816 	if (skb_shinfo(skb)->frag_list &&
1817 	    !(dev->features & NETIF_F_FRAGLIST) &&
1818 	    __skb_linearize(skb))
1819 		goto out_kfree_skb;
1820 
1821 	/* Fragmented skb is linearized if device does not support SG,
1822 	 * or if at least one of fragments is in highmem and device
1823 	 * does not support DMA from it.
1824 	 */
1825 	if (skb_shinfo(skb)->nr_frags &&
1826 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1827 	    __skb_linearize(skb))
1828 		goto out_kfree_skb;
1829 
1830 	/* If packet is not checksummed and device does not support
1831 	 * checksumming for this protocol, complete checksumming here.
1832 	 */
1833 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1834 		skb_set_transport_header(skb, skb->csum_start -
1835 					      skb_headroom(skb));
1836 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1837 			goto out_kfree_skb;
1838 	}
1839 
1840 gso:
1841 	/* Disable soft irqs for various locks below. Also
1842 	 * stops preemption for RCU.
1843 	 */
1844 	rcu_read_lock_bh();
1845 
1846 	txq = dev_pick_tx(dev, skb);
1847 	q = rcu_dereference(txq->qdisc);
1848 
1849 #ifdef CONFIG_NET_CLS_ACT
1850 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1851 #endif
1852 	if (q->enqueue) {
1853 		spinlock_t *root_lock = qdisc_lock(q);
1854 
1855 		spin_lock(root_lock);
1856 
1857 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1858 			kfree_skb(skb);
1859 			rc = NET_XMIT_DROP;
1860 		} else {
1861 			rc = qdisc_enqueue_root(skb, q);
1862 			qdisc_run(q);
1863 		}
1864 		spin_unlock(root_lock);
1865 
1866 		goto out;
1867 	}
1868 
1869 	/* The device has no queue. This is the common case for software
1870 	   devices: loopback and all sorts of tunnels...
1871 
1872 	   Really, it is unlikely that netif_tx_lock protection is necessary
1873 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
1874 	   counters.)
1875 	   However, it is possible that they rely on the protection
1876 	   we provide here.
1877 
1878 	   Check this and take the lock; it is not prone to deadlocks.
1879 	   Or take the noqueue qdisc path, which is even simpler 8)
1880 	 */
1881 	if (dev->flags & IFF_UP) {
1882 		int cpu = smp_processor_id(); /* ok because BHs are off */
1883 
1884 		if (txq->xmit_lock_owner != cpu) {
1885 
1886 			HARD_TX_LOCK(dev, txq, cpu);
1887 
1888 			if (!netif_tx_queue_stopped(txq)) {
1889 				rc = 0;
1890 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1891 					HARD_TX_UNLOCK(dev, txq);
1892 					goto out;
1893 				}
1894 			}
1895 			HARD_TX_UNLOCK(dev, txq);
1896 			if (net_ratelimit())
1897 				printk(KERN_CRIT "Virtual device %s asks to "
1898 				       "queue packet!\n", dev->name);
1899 		} else {
1900 			/* Recursion is detected! It is possible,
1901 			 * unfortunately */
1902 			if (net_ratelimit())
1903 				printk(KERN_CRIT "Dead loop on virtual device "
1904 				       "%s, fix it urgently!\n", dev->name);
1905 		}
1906 	}
1907 
1908 	rc = -ENETDOWN;
1909 	rcu_read_unlock_bh();
1910 
1911 out_kfree_skb:
1912 	kfree_skb(skb);
1913 	return rc;
1914 out:
1915 	rcu_read_unlock_bh();
1916 	return rc;
1917 }
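
/*
 * Illustrative sketch (not part of the original source): how a caller
 * might hand a fully built buffer to dev_queue_xmit().  The device
 * lookup, payload size and protocol value below are hypothetical
 * placeholders; note that the skb is consumed even on error and that
 * positive NET_XMIT_* codes may come back from the queue discipline.
 */
#if 0	/* example only */
static int example_send_raw(struct net *net, const char *ifname)
{
	struct net_device *dev;
	struct sk_buff *skb;
	int rc;

	dev = dev_get_by_name(net, ifname);
	if (!dev)
		return -ENODEV;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + 64, GFP_KERNEL);
	if (!skb) {
		dev_put(dev);
		return -ENOMEM;
	}
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memset(skb_put(skb, 64), 0, 64);	/* dummy payload */

	skb->dev = dev;				/* caller sets the device ... */
	skb->protocol = htons(ETH_P_IP);	/* ... the protocol ... */
	skb->priority = 0;			/* ... and the priority */

	rc = dev_queue_xmit(skb);		/* consumes the skb */
	dev_put(dev);
	return rc;
}
#endif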
1918 
1919 
1920 /*=======================================================================
1921 			Receiver routines
1922   =======================================================================*/
1923 
1924 int netdev_max_backlog __read_mostly = 1000;
1925 int netdev_budget __read_mostly = 300;
1926 int weight_p __read_mostly = 64;            /* old backlog weight */
1927 
1928 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1929 
1930 
1931 /**
1932  *	netif_rx	-	post buffer to the network code
1933  *	@skb: buffer to post
1934  *
1935  *	This function receives a packet from a device driver and queues it for
1936  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1937  *	may be dropped during processing for congestion control or by the
1938  *	protocol layers.
1939  *
1940  *	return values:
1941  *	NET_RX_SUCCESS	(no congestion)
1942  *	NET_RX_DROP     (packet was dropped)
1943  *
1944  */
1945 
1946 int netif_rx(struct sk_buff *skb)
1947 {
1948 	struct softnet_data *queue;
1949 	unsigned long flags;
1950 
1951 	/* if netpoll wants it, pretend we never saw it */
1952 	if (netpoll_rx(skb))
1953 		return NET_RX_DROP;
1954 
1955 	if (!skb->tstamp.tv64)
1956 		net_timestamp(skb);
1957 
1958 	/*
1959 	 * The code is arranged so that the path is shortest when
1960 	 * the CPU is congested but still operating.
1961 	 */
1962 	local_irq_save(flags);
1963 	queue = &__get_cpu_var(softnet_data);
1964 
1965 	__get_cpu_var(netdev_rx_stat).total++;
1966 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1967 		if (queue->input_pkt_queue.qlen) {
1968 enqueue:
1969 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1970 			local_irq_restore(flags);
1971 			return NET_RX_SUCCESS;
1972 		}
1973 
1974 		napi_schedule(&queue->backlog);
1975 		goto enqueue;
1976 	}
1977 
1978 	__get_cpu_var(netdev_rx_stat).dropped++;
1979 	local_irq_restore(flags);
1980 
1981 	kfree_skb(skb);
1982 	return NET_RX_DROP;
1983 }
1984 
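/*
 * Illustrative sketch (not part of the original source): a legacy,
 * non-NAPI receive path handing a frame to netif_rx() from its
 * interrupt handler.  The data/len arguments stand in for hypothetical
 * driver-specific DMA buffer handling.
 */
#if 0	/* example only */
static void example_legacy_rx(struct net_device *dev, void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb)
		return;				/* frame is dropped */

	skb_reserve(skb, NET_IP_ALIGN);
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);				/* queue on this CPU's backlog */
}
#endif
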
1985 int netif_rx_ni(struct sk_buff *skb)
1986 {
1987 	int err;
1988 
1989 	preempt_disable();
1990 	err = netif_rx(skb);
1991 	if (local_softirq_pending())
1992 		do_softirq();
1993 	preempt_enable();
1994 
1995 	return err;
1996 }
1997 
1998 EXPORT_SYMBOL(netif_rx_ni);
1999 
2000 static void net_tx_action(struct softirq_action *h)
2001 {
2002 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2003 
2004 	if (sd->completion_queue) {
2005 		struct sk_buff *clist;
2006 
2007 		local_irq_disable();
2008 		clist = sd->completion_queue;
2009 		sd->completion_queue = NULL;
2010 		local_irq_enable();
2011 
2012 		while (clist) {
2013 			struct sk_buff *skb = clist;
2014 			clist = clist->next;
2015 
2016 			WARN_ON(atomic_read(&skb->users));
2017 			__kfree_skb(skb);
2018 		}
2019 	}
2020 
2021 	if (sd->output_queue) {
2022 		struct Qdisc *head;
2023 
2024 		local_irq_disable();
2025 		head = sd->output_queue;
2026 		sd->output_queue = NULL;
2027 		local_irq_enable();
2028 
2029 		while (head) {
2030 			struct Qdisc *q = head;
2031 			spinlock_t *root_lock;
2032 
2033 			head = head->next_sched;
2034 
2035 			root_lock = qdisc_lock(q);
2036 			if (spin_trylock(root_lock)) {
2037 				smp_mb__before_clear_bit();
2038 				clear_bit(__QDISC_STATE_SCHED,
2039 					  &q->state);
2040 				qdisc_run(q);
2041 				spin_unlock(root_lock);
2042 			} else {
2043 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2044 					      &q->state)) {
2045 					__netif_reschedule(q);
2046 				} else {
2047 					smp_mb__before_clear_bit();
2048 					clear_bit(__QDISC_STATE_SCHED,
2049 						  &q->state);
2050 				}
2051 			}
2052 		}
2053 	}
2054 }
2055 
2056 static inline int deliver_skb(struct sk_buff *skb,
2057 			      struct packet_type *pt_prev,
2058 			      struct net_device *orig_dev)
2059 {
2060 	atomic_inc(&skb->users);
2061 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2062 }
2063 
2064 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2065 /* These hooks defined here for ATM */
2066 struct net_bridge;
2067 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2068 						unsigned char *addr);
2069 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2070 
2071 /*
2072  * If the bridge module is loaded, call the bridging hook.
2073  * Returns NULL if the packet was consumed.
2074  */
2075 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2076 					struct sk_buff *skb) __read_mostly;
2077 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2078 					    struct packet_type **pt_prev, int *ret,
2079 					    struct net_device *orig_dev)
2080 {
2081 	struct net_bridge_port *port;
2082 
2083 	if (skb->pkt_type == PACKET_LOOPBACK ||
2084 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2085 		return skb;
2086 
2087 	if (*pt_prev) {
2088 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2089 		*pt_prev = NULL;
2090 	}
2091 
2092 	return br_handle_frame_hook(port, skb);
2093 }
2094 #else
2095 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2096 #endif
2097 
2098 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2099 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2100 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2101 
2102 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2103 					     struct packet_type **pt_prev,
2104 					     int *ret,
2105 					     struct net_device *orig_dev)
2106 {
2107 	if (skb->dev->macvlan_port == NULL)
2108 		return skb;
2109 
2110 	if (*pt_prev) {
2111 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2112 		*pt_prev = NULL;
2113 	}
2114 	return macvlan_handle_frame_hook(skb);
2115 }
2116 #else
2117 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2118 #endif
2119 
2120 #ifdef CONFIG_NET_CLS_ACT
2121 /* TODO: Maybe we should just force sch_ingress to be compiled in
2122  * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for a few useless
2123  * instructions (a compare and two extra stores) on every packet
2124  * when CONFIG_NET_CLS_ACT is enabled but ingress is not configured.
2125  * NOTE: This doesn't remove any functionality; if you don't have
2126  * the ingress scheduler, you just can't add policies on ingress.
2127  *
2128  */
2129 static int ing_filter(struct sk_buff *skb)
2130 {
2131 	struct net_device *dev = skb->dev;
2132 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2133 	struct netdev_queue *rxq;
2134 	int result = TC_ACT_OK;
2135 	struct Qdisc *q;
2136 
2137 	if (MAX_RED_LOOP < ttl++) {
2138 		printk(KERN_WARNING
2139 		       "Redir loop detected, dropping packet (%d->%d)\n",
2140 		       skb->iif, dev->ifindex);
2141 		return TC_ACT_SHOT;
2142 	}
2143 
2144 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2145 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2146 
2147 	rxq = &dev->rx_queue;
2148 
2149 	q = rxq->qdisc;
2150 	if (q != &noop_qdisc) {
2151 		spin_lock(qdisc_lock(q));
2152 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2153 			result = qdisc_enqueue_root(skb, q);
2154 		spin_unlock(qdisc_lock(q));
2155 	}
2156 
2157 	return result;
2158 }
2159 
2160 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2161 					 struct packet_type **pt_prev,
2162 					 int *ret, struct net_device *orig_dev)
2163 {
2164 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2165 		goto out;
2166 
2167 	if (*pt_prev) {
2168 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2169 		*pt_prev = NULL;
2170 	} else {
2171 		/* Huh? Why does turning on AF_PACKET affect this? */
2172 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2173 	}
2174 
2175 	switch (ing_filter(skb)) {
2176 	case TC_ACT_SHOT:
2177 	case TC_ACT_STOLEN:
2178 		kfree_skb(skb);
2179 		return NULL;
2180 	}
2181 
2182 out:
2183 	skb->tc_verd = 0;
2184 	return skb;
2185 }
2186 #endif
2187 
2188 /*
2189  * 	netif_nit_deliver - deliver received packets to network taps
2190  * 	@skb: buffer
2191  *
2192  * 	This function is used to deliver incoming packets to network
2193  * 	taps. It should be used when the normal netif_receive_skb path
2194  * 	is bypassed, for example because of VLAN acceleration.
2195  */
2196 void netif_nit_deliver(struct sk_buff *skb)
2197 {
2198 	struct packet_type *ptype;
2199 
2200 	if (list_empty(&ptype_all))
2201 		return;
2202 
2203 	skb_reset_network_header(skb);
2204 	skb_reset_transport_header(skb);
2205 	skb->mac_len = skb->network_header - skb->mac_header;
2206 
2207 	rcu_read_lock();
2208 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2209 		if (!ptype->dev || ptype->dev == skb->dev)
2210 			deliver_skb(skb, ptype, skb->dev);
2211 	}
2212 	rcu_read_unlock();
2213 }
2214 
2215 /**
2216  *	netif_receive_skb - process receive buffer from network
2217  *	@skb: buffer to process
2218  *
2219  *	netif_receive_skb() is the main receive data processing function.
2220  *	It always succeeds. The buffer may be dropped during processing
2221  *	for congestion control or by the protocol layers.
2222  *
2223  *	This function may only be called from softirq context and interrupts
2224  *	should be enabled.
2225  *
2226  *	Return values (usually ignored):
2227  *	NET_RX_SUCCESS: no congestion
2228  *	NET_RX_DROP: packet was dropped
2229  */
2230 int netif_receive_skb(struct sk_buff *skb)
2231 {
2232 	struct packet_type *ptype, *pt_prev;
2233 	struct net_device *orig_dev;
2234 	struct net_device *null_or_orig;
2235 	int ret = NET_RX_DROP;
2236 	__be16 type;
2237 
2238 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2239 		return NET_RX_SUCCESS;
2240 
2241 	/* if we've gotten here through NAPI, check netpoll */
2242 	if (netpoll_receive_skb(skb))
2243 		return NET_RX_DROP;
2244 
2245 	if (!skb->tstamp.tv64)
2246 		net_timestamp(skb);
2247 
2248 	if (!skb->iif)
2249 		skb->iif = skb->dev->ifindex;
2250 
2251 	null_or_orig = NULL;
2252 	orig_dev = skb->dev;
2253 	if (orig_dev->master) {
2254 		if (skb_bond_should_drop(skb))
2255 			null_or_orig = orig_dev; /* deliver only exact match */
2256 		else
2257 			skb->dev = orig_dev->master;
2258 	}
2259 
2260 	__get_cpu_var(netdev_rx_stat).total++;
2261 
2262 	skb_reset_network_header(skb);
2263 	skb_reset_transport_header(skb);
2264 	skb->mac_len = skb->network_header - skb->mac_header;
2265 
2266 	pt_prev = NULL;
2267 
2268 	rcu_read_lock();
2269 
2270 	/* Don't receive packets in an exiting network namespace */
2271 	if (!net_alive(dev_net(skb->dev))) {
2272 		kfree_skb(skb);
2273 		goto out;
2274 	}
2275 
2276 #ifdef CONFIG_NET_CLS_ACT
2277 	if (skb->tc_verd & TC_NCLS) {
2278 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2279 		goto ncls;
2280 	}
2281 #endif
2282 
2283 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2284 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2285 		    ptype->dev == orig_dev) {
2286 			if (pt_prev)
2287 				ret = deliver_skb(skb, pt_prev, orig_dev);
2288 			pt_prev = ptype;
2289 		}
2290 	}
2291 
2292 #ifdef CONFIG_NET_CLS_ACT
2293 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2294 	if (!skb)
2295 		goto out;
2296 ncls:
2297 #endif
2298 
2299 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2300 	if (!skb)
2301 		goto out;
2302 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2303 	if (!skb)
2304 		goto out;
2305 
2306 	type = skb->protocol;
2307 	list_for_each_entry_rcu(ptype,
2308 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2309 		if (ptype->type == type &&
2310 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2311 		     ptype->dev == orig_dev)) {
2312 			if (pt_prev)
2313 				ret = deliver_skb(skb, pt_prev, orig_dev);
2314 			pt_prev = ptype;
2315 		}
2316 	}
2317 
2318 	if (pt_prev) {
2319 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2320 	} else {
2321 		kfree_skb(skb);
2322 		/* Jamal, now you will not be able to escape explaining
2323 		 * to me how you were going to use this. :-)
2324 		 */
2325 		ret = NET_RX_DROP;
2326 	}
2327 
2328 out:
2329 	rcu_read_unlock();
2330 	return ret;
2331 }
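
/*
 * Illustrative sketch (not part of the original source): a minimal
 * NAPI poll handler feeding frames into netif_receive_skb() from
 * softirq context.  struct example_priv and example_fetch_frame() are
 * hypothetical stand-ins for driver-specific receive logic.
 */
#if 0	/* example only */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_fetch_frame(priv);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->dev);
		netif_receive_skb(skb);		/* softirq context, IRQs enabled */
		work++;
	}

	if (work < budget)
		napi_complete(napi);		/* done; re-enable device interrupts */
	return work;
}
#endif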
2332 
2333 /* Network device is going away, flush any packets still pending  */
2334 static void flush_backlog(void *arg)
2335 {
2336 	struct net_device *dev = arg;
2337 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2338 	struct sk_buff *skb, *tmp;
2339 
2340 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2341 		if (skb->dev == dev) {
2342 			__skb_unlink(skb, &queue->input_pkt_queue);
2343 			kfree_skb(skb);
2344 		}
2345 }
2346 
2347 static int napi_gro_complete(struct sk_buff *skb)
2348 {
2349 	struct packet_type *ptype;
2350 	__be16 type = skb->protocol;
2351 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2352 	int err = -ENOENT;
2353 
2354 	if (NAPI_GRO_CB(skb)->count == 1)
2355 		goto out;
2356 
2357 	rcu_read_lock();
2358 	list_for_each_entry_rcu(ptype, head, list) {
2359 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2360 			continue;
2361 
2362 		err = ptype->gro_complete(skb);
2363 		break;
2364 	}
2365 	rcu_read_unlock();
2366 
2367 	if (err) {
2368 		WARN_ON(&ptype->list == head);
2369 		kfree_skb(skb);
2370 		return NET_RX_SUCCESS;
2371 	}
2372 
2373 out:
2374 	skb_shinfo(skb)->gso_size = 0;
2375 	__skb_push(skb, -skb_network_offset(skb));
2376 	return netif_receive_skb(skb);
2377 }
2378 
2379 void napi_gro_flush(struct napi_struct *napi)
2380 {
2381 	struct sk_buff *skb, *next;
2382 
2383 	for (skb = napi->gro_list; skb; skb = next) {
2384 		next = skb->next;
2385 		skb->next = NULL;
2386 		napi_gro_complete(skb);
2387 	}
2388 
2389 	napi->gro_list = NULL;
2390 }
2391 EXPORT_SYMBOL(napi_gro_flush);
2392 
2393 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2394 {
2395 	struct sk_buff **pp = NULL;
2396 	struct packet_type *ptype;
2397 	__be16 type = skb->protocol;
2398 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2399 	int count = 0;
2400 	int same_flow;
2401 	int mac_len;
2402 	int free;
2403 
2404 	if (!(skb->dev->features & NETIF_F_GRO))
2405 		goto normal;
2406 
2407 	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2408 		goto normal;
2409 
2410 	rcu_read_lock();
2411 	list_for_each_entry_rcu(ptype, head, list) {
2412 		struct sk_buff *p;
2413 
2414 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2415 			continue;
2416 
2417 		skb_reset_network_header(skb);
2418 		mac_len = skb->network_header - skb->mac_header;
2419 		skb->mac_len = mac_len;
2420 		NAPI_GRO_CB(skb)->same_flow = 0;
2421 		NAPI_GRO_CB(skb)->flush = 0;
2422 		NAPI_GRO_CB(skb)->free = 0;
2423 
2424 		for (p = napi->gro_list; p; p = p->next) {
2425 			count++;
2426 
2427 			if (!NAPI_GRO_CB(p)->same_flow)
2428 				continue;
2429 
2430 			if (p->mac_len != mac_len ||
2431 			    memcmp(skb_mac_header(p), skb_mac_header(skb),
2432 				   mac_len))
2433 				NAPI_GRO_CB(p)->same_flow = 0;
2434 		}
2435 
2436 		pp = ptype->gro_receive(&napi->gro_list, skb);
2437 		break;
2438 	}
2439 	rcu_read_unlock();
2440 
2441 	if (&ptype->list == head)
2442 		goto normal;
2443 
2444 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2445 	free = NAPI_GRO_CB(skb)->free;
2446 
2447 	if (pp) {
2448 		struct sk_buff *nskb = *pp;
2449 
2450 		*pp = nskb->next;
2451 		nskb->next = NULL;
2452 		napi_gro_complete(nskb);
2453 		count--;
2454 	}
2455 
2456 	if (same_flow)
2457 		goto ok;
2458 
2459 	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
2460 		__skb_push(skb, -skb_network_offset(skb));
2461 		goto normal;
2462 	}
2463 
2464 	NAPI_GRO_CB(skb)->count = 1;
2465 	skb_shinfo(skb)->gso_size = skb->len;
2466 	skb->next = napi->gro_list;
2467 	napi->gro_list = skb;
2468 
2469 ok:
2470 	return free;
2471 
2472 normal:
2473 	return -1;
2474 }
2475 EXPORT_SYMBOL(dev_gro_receive);
2476 
2477 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2478 {
2479 	struct sk_buff *p;
2480 
2481 	for (p = napi->gro_list; p; p = p->next) {
2482 		NAPI_GRO_CB(p)->same_flow = 1;
2483 		NAPI_GRO_CB(p)->flush = 0;
2484 	}
2485 
2486 	return dev_gro_receive(napi, skb);
2487 }
2488 
2489 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2490 {
2491 	switch (__napi_gro_receive(napi, skb)) {
2492 	case -1:
2493 		return netif_receive_skb(skb);
2494 
2495 	case 1:
2496 		kfree_skb(skb);
2497 		break;
2498 	}
2499 
2500 	return NET_RX_SUCCESS;
2501 }
2502 EXPORT_SYMBOL(napi_gro_receive);
2503 
2504 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2505 {
2506 	__skb_pull(skb, skb_headlen(skb));
2507 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2508 
2509 	napi->skb = skb;
2510 }
2511 EXPORT_SYMBOL(napi_reuse_skb);
2512 
2513 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2514 				  struct napi_gro_fraginfo *info)
2515 {
2516 	struct net_device *dev = napi->dev;
2517 	struct sk_buff *skb = napi->skb;
2518 
2519 	napi->skb = NULL;
2520 
2521 	if (!skb) {
2522 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2523 		if (!skb)
2524 			goto out;
2525 
2526 		skb_reserve(skb, NET_IP_ALIGN);
2527 	}
2528 
2529 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2530 	skb_shinfo(skb)->nr_frags = info->nr_frags;
2531 	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2532 
2533 	skb->data_len = info->len;
2534 	skb->len += info->len;
2535 	skb->truesize += info->len;
2536 
2537 	if (!pskb_may_pull(skb, ETH_HLEN)) {
2538 		napi_reuse_skb(napi, skb);
2539 		skb = NULL;
2540 		goto out;
2541 	}
2542 
2543 	skb->protocol = eth_type_trans(skb, dev);
2544 
2545 	skb->ip_summed = info->ip_summed;
2546 	skb->csum = info->csum;
2547 
2548 out:
2549 	return skb;
2550 }
2551 EXPORT_SYMBOL(napi_fraginfo_skb);
2552 
2553 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2554 {
2555 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2556 	int err = NET_RX_DROP;
2557 
2558 	if (!skb)
2559 		goto out;
2560 
2561 	err = NET_RX_SUCCESS;
2562 
2563 	switch (__napi_gro_receive(napi, skb)) {
2564 	case -1:
2565 		return netif_receive_skb(skb);
2566 
2567 	case 0:
2568 		goto out;
2569 	}
2570 
2571 	napi_reuse_skb(napi, skb);
2572 
2573 out:
2574 	return err;
2575 }
2576 EXPORT_SYMBOL(napi_gro_frags);
2577 
2578 static int process_backlog(struct napi_struct *napi, int quota)
2579 {
2580 	int work = 0;
2581 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2582 	unsigned long start_time = jiffies;
2583 
2584 	napi->weight = weight_p;
2585 	do {
2586 		struct sk_buff *skb;
2587 
2588 		local_irq_disable();
2589 		skb = __skb_dequeue(&queue->input_pkt_queue);
2590 		if (!skb) {
2591 			__napi_complete(napi);
2592 			local_irq_enable();
2593 			break;
2594 		}
2595 		local_irq_enable();
2596 
2597 		napi_gro_receive(napi, skb);
2598 	} while (++work < quota && jiffies == start_time);
2599 
2600 	napi_gro_flush(napi);
2601 
2602 	return work;
2603 }
2604 
2605 /**
2606  * __napi_schedule - schedule for receive
2607  * @n: entry to schedule
2608  *
2609  * The entry's receive function will be scheduled to run
2610  */
2611 void __napi_schedule(struct napi_struct *n)
2612 {
2613 	unsigned long flags;
2614 
2615 	local_irq_save(flags);
2616 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2617 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2618 	local_irq_restore(flags);
2619 }
2620 EXPORT_SYMBOL(__napi_schedule);
2621 
2622 void __napi_complete(struct napi_struct *n)
2623 {
2624 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2625 	BUG_ON(n->gro_list);
2626 
2627 	list_del(&n->poll_list);
2628 	smp_mb__before_clear_bit();
2629 	clear_bit(NAPI_STATE_SCHED, &n->state);
2630 }
2631 EXPORT_SYMBOL(__napi_complete);
2632 
2633 void napi_complete(struct napi_struct *n)
2634 {
2635 	unsigned long flags;
2636 
2637 	/*
2638 	 * Don't let NAPI dequeue from the CPU poll list
2639 	 * just in case it's running on a different CPU.
2640 	 */
2641 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2642 		return;
2643 
2644 	napi_gro_flush(n);
2645 	local_irq_save(flags);
2646 	__napi_complete(n);
2647 	local_irq_restore(flags);
2648 }
2649 EXPORT_SYMBOL(napi_complete);
2650 
2651 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2652 		    int (*poll)(struct napi_struct *, int), int weight)
2653 {
2654 	INIT_LIST_HEAD(&napi->poll_list);
2655 	napi->gro_list = NULL;
2656 	napi->skb = NULL;
2657 	napi->poll = poll;
2658 	napi->weight = weight;
2659 	list_add(&napi->dev_list, &dev->napi_list);
2660 	napi->dev = dev;
2661 #ifdef CONFIG_NETPOLL
2662 	spin_lock_init(&napi->poll_lock);
2663 	napi->poll_owner = -1;
2664 #endif
2665 	set_bit(NAPI_STATE_SCHED, &napi->state);
2666 }
2667 EXPORT_SYMBOL(netif_napi_add);
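
/*
 * Illustrative sketch (not part of the original source): registering a
 * NAPI instance at probe time.  struct example_priv and example_poll()
 * are the hypothetical driver pieces from the sketch further up; 64 is
 * a typical weight.
 */
#if 0	/* example only */
static void example_setup_napi(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	priv->dev = dev;
	netif_napi_add(dev, &priv->napi, example_poll, 64);
	/* the device interrupt handler later calls napi_schedule(&priv->napi) */
}
#endif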
2668 
2669 void netif_napi_del(struct napi_struct *napi)
2670 {
2671 	struct sk_buff *skb, *next;
2672 
2673 	list_del_init(&napi->dev_list);
2674 	kfree(napi->skb);
2675 
2676 	for (skb = napi->gro_list; skb; skb = next) {
2677 		next = skb->next;
2678 		skb->next = NULL;
2679 		kfree_skb(skb);
2680 	}
2681 
2682 	napi->gro_list = NULL;
2683 }
2684 EXPORT_SYMBOL(netif_napi_del);
2685 
2686 
2687 static void net_rx_action(struct softirq_action *h)
2688 {
2689 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2690 	unsigned long time_limit = jiffies + 2;
2691 	int budget = netdev_budget;
2692 	void *have;
2693 
2694 	local_irq_disable();
2695 
2696 	while (!list_empty(list)) {
2697 		struct napi_struct *n;
2698 		int work, weight;
2699 
2700 		/* If the softirq window is exhausted then punt.
2701 		 * Allow this to run for 2 jiffies, which allows
2702 		 * an average latency of 1.5/HZ.
2703 		 */
2704 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2705 			goto softnet_break;
2706 
2707 		local_irq_enable();
2708 
2709 		/* Even though interrupts have been re-enabled, this
2710 		 * access is safe because interrupts can only add new
2711 		 * entries to the tail of this list, and only ->poll()
2712 		 * calls can remove this head entry from the list.
2713 		 */
2714 		n = list_entry(list->next, struct napi_struct, poll_list);
2715 
2716 		have = netpoll_poll_lock(n);
2717 
2718 		weight = n->weight;
2719 
2720 		/* This NAPI_STATE_SCHED test is for avoiding a race
2721 		 * with netpoll's poll_napi().  Only the entity which
2722 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2723 		 * actually make the ->poll() call.  Therefore we avoid
2724 		 * accidentally calling ->poll() when NAPI is not scheduled.
2725 		 */
2726 		work = 0;
2727 		if (test_bit(NAPI_STATE_SCHED, &n->state))
2728 			work = n->poll(n, weight);
2729 
2730 		WARN_ON_ONCE(work > weight);
2731 
2732 		budget -= work;
2733 
2734 		local_irq_disable();
2735 
2736 		/* Drivers must not modify the NAPI state if they
2737 		 * consume the entire weight.  In such cases this code
2738 		 * still "owns" the NAPI instance and therefore can
2739 		 * move the instance around on the list at-will.
2740 		 */
2741 		if (unlikely(work == weight)) {
2742 			if (unlikely(napi_disable_pending(n)))
2743 				__napi_complete(n);
2744 			else
2745 				list_move_tail(&n->poll_list, list);
2746 		}
2747 
2748 		netpoll_poll_unlock(have);
2749 	}
2750 out:
2751 	local_irq_enable();
2752 
2753 #ifdef CONFIG_NET_DMA
2754 	/*
2755 	 * There may not be any more sk_buffs coming right now, so push
2756 	 * any pending DMA copies to hardware
2757 	 */
2758 	dma_issue_pending_all();
2759 #endif
2760 
2761 	return;
2762 
2763 softnet_break:
2764 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2765 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2766 	goto out;
2767 }
2768 
2769 static gifconf_func_t * gifconf_list [NPROTO];
2770 
2771 /**
2772  *	register_gifconf	-	register a SIOCGIF handler
2773  *	@family: Address family
2774  *	@gifconf: Function handler
2775  *
2776  *	Register protocol dependent address dumping routines. The handler
2777  *	that is passed must not be freed or reused until it has been replaced
2778  *	by another handler.
2779  */
2780 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2781 {
2782 	if (family >= NPROTO)
2783 		return -EINVAL;
2784 	gifconf_list[family] = gifconf;
2785 	return 0;
2786 }
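
/*
 * Illustrative sketch (not part of the original source): how a protocol
 * might hook SIOCGIFCONF.  The handler below is a hypothetical stub; a
 * real handler (such as inet_gifconf for AF_INET) writes one ifreq per
 * configured address into the caller's buffer, or reports the space it
 * would need when bufptr is NULL.
 */
#if 0	/* example only */
static int example_gifconf(struct net_device *dev, char __user *bufptr, int len)
{
	/* fill up to len bytes of ifreq records for dev, return bytes used */
	return 0;
}

static int __init example_gifconf_init(void)
{
	return register_gifconf(AF_INET, example_gifconf);
}
#endif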
2787 
2788 
2789 /*
2790  *	Map an interface index to its name (SIOCGIFNAME)
2791  */
2792 
2793 /*
2794  *	We need this ioctl for efficient implementation of the
2795  *	if_indextoname() function required by the IPv6 API.  Without
2796  *	it, we would have to search all the interfaces to find a
2797  *	match.  --pb
2798  */
2799 
2800 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2801 {
2802 	struct net_device *dev;
2803 	struct ifreq ifr;
2804 
2805 	/*
2806 	 *	Fetch the caller's info block.
2807 	 */
2808 
2809 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2810 		return -EFAULT;
2811 
2812 	read_lock(&dev_base_lock);
2813 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2814 	if (!dev) {
2815 		read_unlock(&dev_base_lock);
2816 		return -ENODEV;
2817 	}
2818 
2819 	strcpy(ifr.ifr_name, dev->name);
2820 	read_unlock(&dev_base_lock);
2821 
2822 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2823 		return -EFAULT;
2824 	return 0;
2825 }
2826 
2827 /*
2828  *	Perform a SIOCGIFCONF call. This structure will change
2829  *	size eventually, and there is nothing I can do about it.
2830  *	Thus we will need a 'compatibility mode'.
2831  */
2832 
2833 static int dev_ifconf(struct net *net, char __user *arg)
2834 {
2835 	struct ifconf ifc;
2836 	struct net_device *dev;
2837 	char __user *pos;
2838 	int len;
2839 	int total;
2840 	int i;
2841 
2842 	/*
2843 	 *	Fetch the caller's info block.
2844 	 */
2845 
2846 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2847 		return -EFAULT;
2848 
2849 	pos = ifc.ifc_buf;
2850 	len = ifc.ifc_len;
2851 
2852 	/*
2853 	 *	Loop over the interfaces, and write an info block for each.
2854 	 */
2855 
2856 	total = 0;
2857 	for_each_netdev(net, dev) {
2858 		for (i = 0; i < NPROTO; i++) {
2859 			if (gifconf_list[i]) {
2860 				int done;
2861 				if (!pos)
2862 					done = gifconf_list[i](dev, NULL, 0);
2863 				else
2864 					done = gifconf_list[i](dev, pos + total,
2865 							       len - total);
2866 				if (done < 0)
2867 					return -EFAULT;
2868 				total += done;
2869 			}
2870 		}
2871 	}
2872 
2873 	/*
2874 	 *	All done.  Write the updated control block back to the caller.
2875 	 */
2876 	ifc.ifc_len = total;
2877 
2878 	/*
2879 	 * 	Both BSD and Solaris return 0 here, so we do too.
2880 	 */
2881 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2882 }
2883 
2884 #ifdef CONFIG_PROC_FS
2885 /*
2886  *	This is invoked by the /proc filesystem handler to display a device
2887  *	in detail.
2888  */
2889 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2890 	__acquires(dev_base_lock)
2891 {
2892 	struct net *net = seq_file_net(seq);
2893 	loff_t off;
2894 	struct net_device *dev;
2895 
2896 	read_lock(&dev_base_lock);
2897 	if (!*pos)
2898 		return SEQ_START_TOKEN;
2899 
2900 	off = 1;
2901 	for_each_netdev(net, dev)
2902 		if (off++ == *pos)
2903 			return dev;
2904 
2905 	return NULL;
2906 }
2907 
2908 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2909 {
2910 	struct net *net = seq_file_net(seq);
2911 	++*pos;
2912 	return v == SEQ_START_TOKEN ?
2913 		first_net_device(net) : next_net_device((struct net_device *)v);
2914 }
2915 
2916 void dev_seq_stop(struct seq_file *seq, void *v)
2917 	__releases(dev_base_lock)
2918 {
2919 	read_unlock(&dev_base_lock);
2920 }
2921 
2922 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2923 {
2924 	const struct net_device_stats *stats = dev_get_stats(dev);
2925 
2926 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2927 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2928 		   dev->name, stats->rx_bytes, stats->rx_packets,
2929 		   stats->rx_errors,
2930 		   stats->rx_dropped + stats->rx_missed_errors,
2931 		   stats->rx_fifo_errors,
2932 		   stats->rx_length_errors + stats->rx_over_errors +
2933 		    stats->rx_crc_errors + stats->rx_frame_errors,
2934 		   stats->rx_compressed, stats->multicast,
2935 		   stats->tx_bytes, stats->tx_packets,
2936 		   stats->tx_errors, stats->tx_dropped,
2937 		   stats->tx_fifo_errors, stats->collisions,
2938 		   stats->tx_carrier_errors +
2939 		    stats->tx_aborted_errors +
2940 		    stats->tx_window_errors +
2941 		    stats->tx_heartbeat_errors,
2942 		   stats->tx_compressed);
2943 }
2944 
2945 /*
2946  *	Called from the PROCfs module. This now uses the new arbitrary-sized
2947  *	/proc/net interface to create /proc/net/dev.
2948  */
2949 static int dev_seq_show(struct seq_file *seq, void *v)
2950 {
2951 	if (v == SEQ_START_TOKEN)
2952 		seq_puts(seq, "Inter-|   Receive                            "
2953 			      "                    |  Transmit\n"
2954 			      " face |bytes    packets errs drop fifo frame "
2955 			      "compressed multicast|bytes    packets errs "
2956 			      "drop fifo colls carrier compressed\n");
2957 	else
2958 		dev_seq_printf_stats(seq, v);
2959 	return 0;
2960 }
2961 
2962 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2963 {
2964 	struct netif_rx_stats *rc = NULL;
2965 
2966 	while (*pos < nr_cpu_ids)
2967 		if (cpu_online(*pos)) {
2968 			rc = &per_cpu(netdev_rx_stat, *pos);
2969 			break;
2970 		} else
2971 			++*pos;
2972 	return rc;
2973 }
2974 
2975 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2976 {
2977 	return softnet_get_online(pos);
2978 }
2979 
2980 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2981 {
2982 	++*pos;
2983 	return softnet_get_online(pos);
2984 }
2985 
2986 static void softnet_seq_stop(struct seq_file *seq, void *v)
2987 {
2988 }
2989 
2990 static int softnet_seq_show(struct seq_file *seq, void *v)
2991 {
2992 	struct netif_rx_stats *s = v;
2993 
2994 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2995 		   s->total, s->dropped, s->time_squeeze, 0,
2996 		   0, 0, 0, 0, /* was fastroute */
2997 		   s->cpu_collision );
2998 	return 0;
2999 }
3000 
3001 static const struct seq_operations dev_seq_ops = {
3002 	.start = dev_seq_start,
3003 	.next  = dev_seq_next,
3004 	.stop  = dev_seq_stop,
3005 	.show  = dev_seq_show,
3006 };
3007 
3008 static int dev_seq_open(struct inode *inode, struct file *file)
3009 {
3010 	return seq_open_net(inode, file, &dev_seq_ops,
3011 			    sizeof(struct seq_net_private));
3012 }
3013 
3014 static const struct file_operations dev_seq_fops = {
3015 	.owner	 = THIS_MODULE,
3016 	.open    = dev_seq_open,
3017 	.read    = seq_read,
3018 	.llseek  = seq_lseek,
3019 	.release = seq_release_net,
3020 };
3021 
3022 static const struct seq_operations softnet_seq_ops = {
3023 	.start = softnet_seq_start,
3024 	.next  = softnet_seq_next,
3025 	.stop  = softnet_seq_stop,
3026 	.show  = softnet_seq_show,
3027 };
3028 
3029 static int softnet_seq_open(struct inode *inode, struct file *file)
3030 {
3031 	return seq_open(file, &softnet_seq_ops);
3032 }
3033 
3034 static const struct file_operations softnet_seq_fops = {
3035 	.owner	 = THIS_MODULE,
3036 	.open    = softnet_seq_open,
3037 	.read    = seq_read,
3038 	.llseek  = seq_lseek,
3039 	.release = seq_release,
3040 };
3041 
3042 static void *ptype_get_idx(loff_t pos)
3043 {
3044 	struct packet_type *pt = NULL;
3045 	loff_t i = 0;
3046 	int t;
3047 
3048 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3049 		if (i == pos)
3050 			return pt;
3051 		++i;
3052 	}
3053 
3054 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3055 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3056 			if (i == pos)
3057 				return pt;
3058 			++i;
3059 		}
3060 	}
3061 	return NULL;
3062 }
3063 
3064 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3065 	__acquires(RCU)
3066 {
3067 	rcu_read_lock();
3068 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3069 }
3070 
3071 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3072 {
3073 	struct packet_type *pt;
3074 	struct list_head *nxt;
3075 	int hash;
3076 
3077 	++*pos;
3078 	if (v == SEQ_START_TOKEN)
3079 		return ptype_get_idx(0);
3080 
3081 	pt = v;
3082 	nxt = pt->list.next;
3083 	if (pt->type == htons(ETH_P_ALL)) {
3084 		if (nxt != &ptype_all)
3085 			goto found;
3086 		hash = 0;
3087 		nxt = ptype_base[0].next;
3088 	} else
3089 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3090 
3091 	while (nxt == &ptype_base[hash]) {
3092 		if (++hash >= PTYPE_HASH_SIZE)
3093 			return NULL;
3094 		nxt = ptype_base[hash].next;
3095 	}
3096 found:
3097 	return list_entry(nxt, struct packet_type, list);
3098 }
3099 
3100 static void ptype_seq_stop(struct seq_file *seq, void *v)
3101 	__releases(RCU)
3102 {
3103 	rcu_read_unlock();
3104 }
3105 
3106 static int ptype_seq_show(struct seq_file *seq, void *v)
3107 {
3108 	struct packet_type *pt = v;
3109 
3110 	if (v == SEQ_START_TOKEN)
3111 		seq_puts(seq, "Type Device      Function\n");
3112 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3113 		if (pt->type == htons(ETH_P_ALL))
3114 			seq_puts(seq, "ALL ");
3115 		else
3116 			seq_printf(seq, "%04x", ntohs(pt->type));
3117 
3118 		seq_printf(seq, " %-8s %pF\n",
3119 			   pt->dev ? pt->dev->name : "", pt->func);
3120 	}
3121 
3122 	return 0;
3123 }
3124 
3125 static const struct seq_operations ptype_seq_ops = {
3126 	.start = ptype_seq_start,
3127 	.next  = ptype_seq_next,
3128 	.stop  = ptype_seq_stop,
3129 	.show  = ptype_seq_show,
3130 };
3131 
3132 static int ptype_seq_open(struct inode *inode, struct file *file)
3133 {
3134 	return seq_open_net(inode, file, &ptype_seq_ops,
3135 			sizeof(struct seq_net_private));
3136 }
3137 
3138 static const struct file_operations ptype_seq_fops = {
3139 	.owner	 = THIS_MODULE,
3140 	.open    = ptype_seq_open,
3141 	.read    = seq_read,
3142 	.llseek  = seq_lseek,
3143 	.release = seq_release_net,
3144 };
3145 
3146 
3147 static int __net_init dev_proc_net_init(struct net *net)
3148 {
3149 	int rc = -ENOMEM;
3150 
3151 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3152 		goto out;
3153 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3154 		goto out_dev;
3155 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3156 		goto out_softnet;
3157 
3158 	if (wext_proc_init(net))
3159 		goto out_ptype;
3160 	rc = 0;
3161 out:
3162 	return rc;
3163 out_ptype:
3164 	proc_net_remove(net, "ptype");
3165 out_softnet:
3166 	proc_net_remove(net, "softnet_stat");
3167 out_dev:
3168 	proc_net_remove(net, "dev");
3169 	goto out;
3170 }
3171 
3172 static void __net_exit dev_proc_net_exit(struct net *net)
3173 {
3174 	wext_proc_exit(net);
3175 
3176 	proc_net_remove(net, "ptype");
3177 	proc_net_remove(net, "softnet_stat");
3178 	proc_net_remove(net, "dev");
3179 }
3180 
3181 static struct pernet_operations __net_initdata dev_proc_ops = {
3182 	.init = dev_proc_net_init,
3183 	.exit = dev_proc_net_exit,
3184 };
3185 
3186 static int __init dev_proc_init(void)
3187 {
3188 	return register_pernet_subsys(&dev_proc_ops);
3189 }
3190 #else
3191 #define dev_proc_init() 0
3192 #endif	/* CONFIG_PROC_FS */
3193 
3194 
3195 /**
3196  *	netdev_set_master	-	set up master/slave pair
3197  *	@slave: slave device
3198  *	@master: new master device
3199  *
3200  *	Changes the master device of the slave. Pass %NULL to break the
3201  *	bonding. The caller must hold the RTNL semaphore. On a failure
3202  *	a negative errno code is returned. On success the reference counts
3203  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3204  *	function returns zero.
3205  */
3206 int netdev_set_master(struct net_device *slave, struct net_device *master)
3207 {
3208 	struct net_device *old = slave->master;
3209 
3210 	ASSERT_RTNL();
3211 
3212 	if (master) {
3213 		if (old)
3214 			return -EBUSY;
3215 		dev_hold(master);
3216 	}
3217 
3218 	slave->master = master;
3219 
3220 	synchronize_net();
3221 
3222 	if (old)
3223 		dev_put(old);
3224 
3225 	if (master)
3226 		slave->flags |= IFF_SLAVE;
3227 	else
3228 		slave->flags &= ~IFF_SLAVE;
3229 
3230 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3231 	return 0;
3232 }
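
/*
 * Illustrative sketch (not part of the original source): a bonding-style
 * master enslaving and releasing a device.  Both calls must be made
 * under the RTNL; error handling is trimmed to the essentials.
 */
#if 0	/* example only */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave, master);	/* -EBUSY if already enslaved */
	rtnl_unlock();
	return err;
}

static void example_release(struct net_device *slave)
{
	rtnl_lock();
	netdev_set_master(slave, NULL);		/* NULL breaks the bonding */
	rtnl_unlock();
}
#endif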
3233 
3234 static void dev_change_rx_flags(struct net_device *dev, int flags)
3235 {
3236 	const struct net_device_ops *ops = dev->netdev_ops;
3237 
3238 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3239 		ops->ndo_change_rx_flags(dev, flags);
3240 }
3241 
3242 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3243 {
3244 	unsigned short old_flags = dev->flags;
3245 	uid_t uid;
3246 	gid_t gid;
3247 
3248 	ASSERT_RTNL();
3249 
3250 	dev->flags |= IFF_PROMISC;
3251 	dev->promiscuity += inc;
3252 	if (dev->promiscuity == 0) {
3253 		/*
3254 		 * Avoid overflow.
3255 		 * If inc causes overflow, leave promisc untouched and return an error.
3256 		 */
3257 		if (inc < 0)
3258 			dev->flags &= ~IFF_PROMISC;
3259 		else {
3260 			dev->promiscuity -= inc;
3261 			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
3262 				"set promiscuity failed; the promiscuity feature "
3263 				"of the device might be broken.\n", dev->name);
3264 			return -EOVERFLOW;
3265 		}
3266 	}
3267 	if (dev->flags != old_flags) {
3268 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3269 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3270 							       "left");
3271 		if (audit_enabled) {
3272 			current_uid_gid(&uid, &gid);
3273 			audit_log(current->audit_context, GFP_ATOMIC,
3274 				AUDIT_ANOM_PROMISCUOUS,
3275 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3276 				dev->name, (dev->flags & IFF_PROMISC),
3277 				(old_flags & IFF_PROMISC),
3278 				audit_get_loginuid(current),
3279 				uid, gid,
3280 				audit_get_sessionid(current));
3281 		}
3282 
3283 		dev_change_rx_flags(dev, IFF_PROMISC);
3284 	}
3285 	return 0;
3286 }
3287 
3288 /**
3289  *	dev_set_promiscuity	- update promiscuity count on a device
3290  *	@dev: device
3291  *	@inc: modifier
3292  *
3293  *	Add or remove promiscuity from a device. While the count in the device
3294  *	remains above zero the interface remains promiscuous. Once it hits zero
3295  *	the device reverts back to normal filtering operation. A negative inc
3296  *	value is used to drop promiscuity on the device.
3297  *	Return 0 if successful or a negative errno code on error.
3298  */
3299 int dev_set_promiscuity(struct net_device *dev, int inc)
3300 {
3301 	unsigned short old_flags = dev->flags;
3302 	int err;
3303 
3304 	err = __dev_set_promiscuity(dev, inc);
3305 	if (err < 0)
3306 		return err;
3307 	if (dev->flags != old_flags)
3308 		dev_set_rx_mode(dev);
3309 	return err;
3310 }
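
/*
 * Illustrative sketch (not part of the original source): a packet-tap
 * style user of dev_set_promiscuity().  Calls must be balanced: every
 * +1 while the tap is active is matched by a -1 when it stops.
 * dev_set_allmulti() follows the same pattern for multicast.  The
 * counter is taken and released under the RTNL.
 */
#if 0	/* example only */
static int example_tap_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void example_tap_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}
#endif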
3311 
3312 /**
3313  *	dev_set_allmulti	- update allmulti count on a device
3314  *	@dev: device
3315  *	@inc: modifier
3316  *
3317  *	Add or remove reception of all multicast frames to a device. While the
3318  *	count in the device remains above zero the interface keeps receiving
3319  *	all multicast frames. Once it hits zero the device reverts back to
3320  *	normal filtering operation. A negative @inc value is used to drop the counter
3321  *	when releasing a resource needing all multicasts.
3322  *	Return 0 if successful or a negative errno code on error.
3323  */
3324 
3325 int dev_set_allmulti(struct net_device *dev, int inc)
3326 {
3327 	unsigned short old_flags = dev->flags;
3328 
3329 	ASSERT_RTNL();
3330 
3331 	dev->flags |= IFF_ALLMULTI;
3332 	dev->allmulti += inc;
3333 	if (dev->allmulti == 0) {
3334 		/*
3335 		 * Avoid overflow.
3336 		 * If inc causes overflow, untouch allmulti and return error.
3337 		 * If inc causes overflow, leave allmulti untouched and return an error.
3338 		if (inc < 0)
3339 			dev->flags &= ~IFF_ALLMULTI;
3340 		else {
3341 			dev->allmulti -= inc;
3342 			printk(KERN_WARNING "%s: allmulti counter overflowed, "
3343 				"set allmulti failed; the allmulti feature of "
3344 				"the device might be broken.\n", dev->name);
3345 			return -EOVERFLOW;
3346 		}
3347 	}
3348 	if (dev->flags ^ old_flags) {
3349 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3350 		dev_set_rx_mode(dev);
3351 	}
3352 	return 0;
3353 }
3354 
3355 /*
3356  *	Upload unicast and multicast address lists to device and
3357  *	configure RX filtering. When the device doesn't support unicast
3358  *	filtering it is put in promiscuous mode while unicast addresses
3359  *	are present.
3360  */
3361 void __dev_set_rx_mode(struct net_device *dev)
3362 {
3363 	const struct net_device_ops *ops = dev->netdev_ops;
3364 
3365 	/* dev_open will call this function so the list will stay sane. */
3366 	if (!(dev->flags&IFF_UP))
3367 		return;
3368 
3369 	if (!netif_device_present(dev))
3370 		return;
3371 
3372 	if (ops->ndo_set_rx_mode)
3373 		ops->ndo_set_rx_mode(dev);
3374 	else {
3375 		/* Unicast address changes may only happen under the rtnl,
3376 		 * therefore calling __dev_set_promiscuity here is safe.
3377 		 */
3378 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3379 			__dev_set_promiscuity(dev, 1);
3380 			dev->uc_promisc = 1;
3381 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3382 			__dev_set_promiscuity(dev, -1);
3383 			dev->uc_promisc = 0;
3384 		}
3385 
3386 		if (ops->ndo_set_multicast_list)
3387 			ops->ndo_set_multicast_list(dev);
3388 	}
3389 }
3390 
3391 void dev_set_rx_mode(struct net_device *dev)
3392 {
3393 	netif_addr_lock_bh(dev);
3394 	__dev_set_rx_mode(dev);
3395 	netif_addr_unlock_bh(dev);
3396 }
3397 
3398 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3399 		      void *addr, int alen, int glbl)
3400 {
3401 	struct dev_addr_list *da;
3402 
3403 	for (; (da = *list) != NULL; list = &da->next) {
3404 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3405 		    alen == da->da_addrlen) {
3406 			if (glbl) {
3407 				int old_glbl = da->da_gusers;
3408 				da->da_gusers = 0;
3409 				if (old_glbl == 0)
3410 					break;
3411 			}
3412 			if (--da->da_users)
3413 				return 0;
3414 
3415 			*list = da->next;
3416 			kfree(da);
3417 			(*count)--;
3418 			return 0;
3419 		}
3420 	}
3421 	return -ENOENT;
3422 }
3423 
3424 int __dev_addr_add(struct dev_addr_list **list, int *count,
3425 		   void *addr, int alen, int glbl)
3426 {
3427 	struct dev_addr_list *da;
3428 
3429 	for (da = *list; da != NULL; da = da->next) {
3430 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3431 		    da->da_addrlen == alen) {
3432 			if (glbl) {
3433 				int old_glbl = da->da_gusers;
3434 				da->da_gusers = 1;
3435 				if (old_glbl)
3436 					return 0;
3437 			}
3438 			da->da_users++;
3439 			return 0;
3440 		}
3441 	}
3442 
3443 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3444 	if (da == NULL)
3445 		return -ENOMEM;
3446 	memcpy(da->da_addr, addr, alen);
3447 	da->da_addrlen = alen;
3448 	da->da_users = 1;
3449 	da->da_gusers = glbl ? 1 : 0;
3450 	da->next = *list;
3451 	*list = da;
3452 	(*count)++;
3453 	return 0;
3454 }
3455 
3456 /**
3457  *	dev_unicast_delete	- Release secondary unicast address.
3458  *	@dev: device
3459  *	@addr: address to delete
3460  *	@alen: length of @addr
3461  *
3462  *	Release reference to a secondary unicast address and remove it
3463  *	from the device if the reference count drops to zero.
3464  *
3465  * 	The caller must hold the rtnl_mutex.
3466  */
3467 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3468 {
3469 	int err;
3470 
3471 	ASSERT_RTNL();
3472 
3473 	netif_addr_lock_bh(dev);
3474 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3475 	if (!err)
3476 		__dev_set_rx_mode(dev);
3477 	netif_addr_unlock_bh(dev);
3478 	return err;
3479 }
3480 EXPORT_SYMBOL(dev_unicast_delete);
3481 
3482 /**
3483  *	dev_unicast_add		- add a secondary unicast address
3484  *	@dev: device
3485  *	@addr: address to add
3486  *	@alen: length of @addr
3487  *
3488  *	Add a secondary unicast address to the device or increase
3489  *	the reference count if it already exists.
3490  *
3491  *	The caller must hold the rtnl_mutex.
3492  */
3493 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3494 {
3495 	int err;
3496 
3497 	ASSERT_RTNL();
3498 
3499 	netif_addr_lock_bh(dev);
3500 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3501 	if (!err)
3502 		__dev_set_rx_mode(dev);
3503 	netif_addr_unlock_bh(dev);
3504 	return err;
3505 }
3506 EXPORT_SYMBOL(dev_unicast_add);
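
/*
 * Illustrative sketch (not part of the original source): claiming and
 * releasing a secondary unicast (MAC) address on a lower device, as a
 * macvlan-style upper device might.  The mac argument is a placeholder;
 * both calls are made under the RTNL as required.
 */
#if 0	/* example only */
static int example_claim_address(struct net_device *lower, const unsigned char *mac)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(lower, (void *)mac, ETH_ALEN);
	rtnl_unlock();
	return err;
}

static void example_drop_address(struct net_device *lower, const unsigned char *mac)
{
	rtnl_lock();
	dev_unicast_delete(lower, (void *)mac, ETH_ALEN);
	rtnl_unlock();
}
#endif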
3507 
3508 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3509 		    struct dev_addr_list **from, int *from_count)
3510 {
3511 	struct dev_addr_list *da, *next;
3512 	int err = 0;
3513 
3514 	da = *from;
3515 	while (da != NULL) {
3516 		next = da->next;
3517 		if (!da->da_synced) {
3518 			err = __dev_addr_add(to, to_count,
3519 					     da->da_addr, da->da_addrlen, 0);
3520 			if (err < 0)
3521 				break;
3522 			da->da_synced = 1;
3523 			da->da_users++;
3524 		} else if (da->da_users == 1) {
3525 			__dev_addr_delete(to, to_count,
3526 					  da->da_addr, da->da_addrlen, 0);
3527 			__dev_addr_delete(from, from_count,
3528 					  da->da_addr, da->da_addrlen, 0);
3529 		}
3530 		da = next;
3531 	}
3532 	return err;
3533 }
3534 
3535 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3536 		       struct dev_addr_list **from, int *from_count)
3537 {
3538 	struct dev_addr_list *da, *next;
3539 
3540 	da = *from;
3541 	while (da != NULL) {
3542 		next = da->next;
3543 		if (da->da_synced) {
3544 			__dev_addr_delete(to, to_count,
3545 					  da->da_addr, da->da_addrlen, 0);
3546 			da->da_synced = 0;
3547 			__dev_addr_delete(from, from_count,
3548 					  da->da_addr, da->da_addrlen, 0);
3549 		}
3550 		da = next;
3551 	}
3552 }
3553 
3554 /**
3555  *	dev_unicast_sync - Synchronize device's unicast list to another device
3556  *	@to: destination device
3557  *	@from: source device
3558  *
3559  *	Add newly added addresses to the destination device and release
3560  *	addresses that have no users left. The source device must be
3561  *	locked by netif_tx_lock_bh.
3562  *
3563  *	This function is intended to be called from the dev->set_rx_mode
3564  *	function of layered software devices.
3565  */
3566 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3567 {
3568 	int err = 0;
3569 
3570 	netif_addr_lock_bh(to);
3571 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3572 			      &from->uc_list, &from->uc_count);
3573 	if (!err)
3574 		__dev_set_rx_mode(to);
3575 	netif_addr_unlock_bh(to);
3576 	return err;
3577 }
3578 EXPORT_SYMBOL(dev_unicast_sync);
3579 
3580 /**
3581  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3582  *	@to: destination device
3583  *	@from: source device
3584  *
3585  *	Remove all addresses that were added to the destination device by
3586  *	dev_unicast_sync(). This function is intended to be called from the
3587  *	dev->stop function of layered software devices.
3588  */
3589 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3590 {
3591 	netif_addr_lock_bh(from);
3592 	netif_addr_lock(to);
3593 
3594 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3595 			  &from->uc_list, &from->uc_count);
3596 	__dev_set_rx_mode(to);
3597 
3598 	netif_addr_unlock(to);
3599 	netif_addr_unlock_bh(from);
3600 }
3601 EXPORT_SYMBOL(dev_unicast_unsync);
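
/*
 * Illustrative sketch (not part of the original source): how a layered
 * device (VLAN-style) propagates its unicast list to the real lower
 * device.  example_upper_set_rx_mode() stands in for the upper device's
 * set_rx_mode hook and example_upper_stop() for its stop hook;
 * priv->lowerdev is a hypothetical pointer to the underlying device.
 */
#if 0	/* example only */
static void example_upper_set_rx_mode(struct net_device *upper)
{
	struct example_upper_priv *priv = netdev_priv(upper);

	dev_unicast_sync(priv->lowerdev, upper);	/* push new addresses down */
}

static int example_upper_stop(struct net_device *upper)
{
	struct example_upper_priv *priv = netdev_priv(upper);

	dev_unicast_unsync(priv->lowerdev, upper);	/* remove what we added */
	return 0;
}
#endif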
3602 
3603 static void __dev_addr_discard(struct dev_addr_list **list)
3604 {
3605 	struct dev_addr_list *tmp;
3606 
3607 	while (*list != NULL) {
3608 		tmp = *list;
3609 		*list = tmp->next;
3610 		if (tmp->da_users > tmp->da_gusers)
3611 			printk("__dev_addr_discard: address leakage! "
3612 			       "da_users=%d\n", tmp->da_users);
3613 		kfree(tmp);
3614 	}
3615 }
3616 
3617 static void dev_addr_discard(struct net_device *dev)
3618 {
3619 	netif_addr_lock_bh(dev);
3620 
3621 	__dev_addr_discard(&dev->uc_list);
3622 	dev->uc_count = 0;
3623 
3624 	__dev_addr_discard(&dev->mc_list);
3625 	dev->mc_count = 0;
3626 
3627 	netif_addr_unlock_bh(dev);
3628 }
3629 
3630 /**
3631  *	dev_get_flags - get flags reported to userspace
3632  *	@dev: device
3633  *
3634  *	Get the combination of flag bits exported through APIs to userspace.
3635  */
3636 unsigned dev_get_flags(const struct net_device *dev)
3637 {
3638 	unsigned flags;
3639 
3640 	flags = (dev->flags & ~(IFF_PROMISC |
3641 				IFF_ALLMULTI |
3642 				IFF_RUNNING |
3643 				IFF_LOWER_UP |
3644 				IFF_DORMANT)) |
3645 		(dev->gflags & (IFF_PROMISC |
3646 				IFF_ALLMULTI));
3647 
3648 	if (netif_running(dev)) {
3649 		if (netif_oper_up(dev))
3650 			flags |= IFF_RUNNING;
3651 		if (netif_carrier_ok(dev))
3652 			flags |= IFF_LOWER_UP;
3653 		if (netif_dormant(dev))
3654 			flags |= IFF_DORMANT;
3655 	}
3656 
3657 	return flags;
3658 }
3659 
3660 /**
3661  *	dev_change_flags - change device settings
3662  *	@dev: device
3663  *	@flags: device state flags
3664  *
3665  *	Change settings on device based state flags. The flags are
3666  *	in the userspace exported format.
3667  */
3668 int dev_change_flags(struct net_device *dev, unsigned flags)
3669 {
3670 	int ret, changes;
3671 	int old_flags = dev->flags;
3672 
3673 	ASSERT_RTNL();
3674 
3675 	/*
3676 	 *	Set the flags on our device.
3677 	 */
3678 
3679 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3680 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3681 			       IFF_AUTOMEDIA)) |
3682 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3683 				    IFF_ALLMULTI));
3684 
3685 	/*
3686 	 *	Load in the correct multicast list now the flags have changed.
3687 	 */
3688 
3689 	if ((old_flags ^ flags) & IFF_MULTICAST)
3690 		dev_change_rx_flags(dev, IFF_MULTICAST);
3691 
3692 	dev_set_rx_mode(dev);
3693 
3694 	/*
3695 	 *	Have we downed the interface. We handle IFF_UP ourselves
3696 	 *	Have we downed the interface? We handle IFF_UP ourselves
3697 	 *	setting it.
3698 	 */
3699 
3700 	ret = 0;
3701 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3702 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3703 
3704 		if (!ret)
3705 			dev_set_rx_mode(dev);
3706 	}
3707 
3708 	if (dev->flags & IFF_UP &&
3709 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3710 					  IFF_VOLATILE)))
3711 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3712 
3713 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3714 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3715 		dev->gflags ^= IFF_PROMISC;
3716 		dev_set_promiscuity(dev, inc);
3717 	}
3718 
3719 	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3720 	   is important. Some (broken) drivers set IFF_PROMISC when
3721 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
3722 	 */
3723 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3724 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3725 		dev->gflags ^= IFF_ALLMULTI;
3726 		dev_set_allmulti(dev, inc);
3727 	}
3728 
3729 	/* Exclude state transition flags, already notified */
3730 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3731 	if (changes)
3732 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3733 
3734 	return ret;
3735 }
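
/*
 * Illustrative sketch (not part of the original source): bringing an
 * interface administratively up or down from kernel code, the same way
 * SIOCSIFFLAGS does.  The current flags are read back with
 * dev_get_flags() so that only IFF_UP changes; the RTNL is held around
 * the call as in the ioctl path.
 */
#if 0	/* example only */
static int example_set_up(struct net_device *dev, int up)
{
	unsigned flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	if (up)
		flags |= IFF_UP;
	else
		flags &= ~IFF_UP;
	err = dev_change_flags(dev, flags);
	rtnl_unlock();
	return err;
}
#endif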
3736 
3737 /**
3738  *	dev_set_mtu - Change maximum transfer unit
3739  *	@dev: device
3740  *	@new_mtu: new transfer unit
3741  *
3742  *	Change the maximum transfer size of the network device.
3743  */
3744 int dev_set_mtu(struct net_device *dev, int new_mtu)
3745 {
3746 	const struct net_device_ops *ops = dev->netdev_ops;
3747 	int err;
3748 
3749 	if (new_mtu == dev->mtu)
3750 		return 0;
3751 
3752 	/*	MTU must be positive.	 */
3753 	if (new_mtu < 0)
3754 		return -EINVAL;
3755 
3756 	if (!netif_device_present(dev))
3757 		return -ENODEV;
3758 
3759 	err = 0;
3760 	if (ops->ndo_change_mtu)
3761 		err = ops->ndo_change_mtu(dev, new_mtu);
3762 	else
3763 		dev->mtu = new_mtu;
3764 
3765 	if (!err && dev->flags & IFF_UP)
3766 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3767 	return err;
3768 }
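
/*
 * Illustrative sketch (not part of the original source): changing the
 * MTU from kernel code, equivalent to SIOCSIFMTU.  The RTNL is held
 * around the call as in the ioctl path; 9000 is just an illustrative
 * jumbo-frame value and the driver may still reject it.
 */
#if 0	/* example only */
static int example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}
#endif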
3769 
3770 /**
3771  *	dev_set_mac_address - Change Media Access Control Address
3772  *	@dev: device
3773  *	@sa: new address
3774  *
3775  *	Change the hardware (MAC) address of the device
3776  */
3777 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3778 {
3779 	const struct net_device_ops *ops = dev->netdev_ops;
3780 	int err;
3781 
3782 	if (!ops->ndo_set_mac_address)
3783 		return -EOPNOTSUPP;
3784 	if (sa->sa_family != dev->type)
3785 		return -EINVAL;
3786 	if (!netif_device_present(dev))
3787 		return -ENODEV;
3788 	err = ops->ndo_set_mac_address(dev, sa);
3789 	if (!err)
3790 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3791 	return err;
3792 }
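
/*
 * Illustrative sketch (not part of the original source): setting a new
 * hardware address from kernel code, as SIOCSIFHWADDR does.  The mac
 * argument is a placeholder; sa_family must match dev->type and the
 * RTNL is held around the call as in the ioctl path.
 */
#if 0	/* example only */
static int example_set_mac(struct net_device *dev, const unsigned char *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif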
3793 
3794 /*
3795  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3796  */
3797 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3798 {
3799 	int err;
3800 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3801 
3802 	if (!dev)
3803 		return -ENODEV;
3804 
3805 	switch (cmd) {
3806 		case SIOCGIFFLAGS:	/* Get interface flags */
3807 			ifr->ifr_flags = dev_get_flags(dev);
3808 			return 0;
3809 
3810 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3811 					   (currently unused) */
3812 			ifr->ifr_metric = 0;
3813 			return 0;
3814 
3815 		case SIOCGIFMTU:	/* Get the MTU of a device */
3816 			ifr->ifr_mtu = dev->mtu;
3817 			return 0;
3818 
3819 		case SIOCGIFHWADDR:
3820 			if (!dev->addr_len)
3821 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3822 			else
3823 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3824 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3825 			ifr->ifr_hwaddr.sa_family = dev->type;
3826 			return 0;
3827 
3828 		case SIOCGIFSLAVE:
3829 			err = -EINVAL;
3830 			break;
3831 
3832 		case SIOCGIFMAP:
3833 			ifr->ifr_map.mem_start = dev->mem_start;
3834 			ifr->ifr_map.mem_end   = dev->mem_end;
3835 			ifr->ifr_map.base_addr = dev->base_addr;
3836 			ifr->ifr_map.irq       = dev->irq;
3837 			ifr->ifr_map.dma       = dev->dma;
3838 			ifr->ifr_map.port      = dev->if_port;
3839 			return 0;
3840 
3841 		case SIOCGIFINDEX:
3842 			ifr->ifr_ifindex = dev->ifindex;
3843 			return 0;
3844 
3845 		case SIOCGIFTXQLEN:
3846 			ifr->ifr_qlen = dev->tx_queue_len;
3847 			return 0;
3848 
3849 		default:
3850 			/* dev_ioctl() should ensure this case
3851 			 * is never reached
3852 			 */
3853 			WARN_ON(1);
3854 			err = -EINVAL;
3855 			break;
3856 
3857 	}
3858 	return err;
3859 }
3860 
3861 /*
3862  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3863  */
3864 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3865 {
3866 	int err;
3867 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3868 	const struct net_device_ops *ops;
3869 
3870 	if (!dev)
3871 		return -ENODEV;
3872 
3873 	ops = dev->netdev_ops;
3874 
3875 	switch (cmd) {
3876 		case SIOCSIFFLAGS:	/* Set interface flags */
3877 			return dev_change_flags(dev, ifr->ifr_flags);
3878 
3879 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3880 					   (currently unused) */
3881 			return -EOPNOTSUPP;
3882 
3883 		case SIOCSIFMTU:	/* Set the MTU of a device */
3884 			return dev_set_mtu(dev, ifr->ifr_mtu);
3885 
3886 		case SIOCSIFHWADDR:
3887 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3888 
3889 		case SIOCSIFHWBROADCAST:
3890 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3891 				return -EINVAL;
3892 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3893 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3894 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3895 			return 0;
3896 
3897 		case SIOCSIFMAP:
3898 			if (ops->ndo_set_config) {
3899 				if (!netif_device_present(dev))
3900 					return -ENODEV;
3901 				return ops->ndo_set_config(dev, &ifr->ifr_map);
3902 			}
3903 			return -EOPNOTSUPP;
3904 
3905 		case SIOCADDMULTI:
3906 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3907 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3908 				return -EINVAL;
3909 			if (!netif_device_present(dev))
3910 				return -ENODEV;
3911 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3912 					  dev->addr_len, 1);
3913 
3914 		case SIOCDELMULTI:
3915 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3916 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3917 				return -EINVAL;
3918 			if (!netif_device_present(dev))
3919 				return -ENODEV;
3920 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3921 					     dev->addr_len, 1);
3922 
3923 		case SIOCSIFTXQLEN:
3924 			if (ifr->ifr_qlen < 0)
3925 				return -EINVAL;
3926 			dev->tx_queue_len = ifr->ifr_qlen;
3927 			return 0;
3928 
3929 		case SIOCSIFNAME:
3930 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3931 			return dev_change_name(dev, ifr->ifr_newname);
3932 
3933 		/*
3934 		 *	Unknown or private ioctl
3935 		 */
3936 
3937 		default:
3938 			if ((cmd >= SIOCDEVPRIVATE &&
3939 			    cmd <= SIOCDEVPRIVATE + 15) ||
3940 			    cmd == SIOCBONDENSLAVE ||
3941 			    cmd == SIOCBONDRELEASE ||
3942 			    cmd == SIOCBONDSETHWADDR ||
3943 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3944 			    cmd == SIOCBONDINFOQUERY ||
3945 			    cmd == SIOCBONDCHANGEACTIVE ||
3946 			    cmd == SIOCGMIIPHY ||
3947 			    cmd == SIOCGMIIREG ||
3948 			    cmd == SIOCSMIIREG ||
3949 			    cmd == SIOCBRADDIF ||
3950 			    cmd == SIOCBRDELIF ||
3951 			    cmd == SIOCWANDEV) {
3952 				err = -EOPNOTSUPP;
3953 				if (ops->ndo_do_ioctl) {
3954 					if (netif_device_present(dev))
3955 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3956 					else
3957 						err = -ENODEV;
3958 				}
3959 			} else
3960 				err = -EINVAL;
3961 
3962 	}
3963 	return err;
3964 }
3965 
3966 /*
3967  *	This function handles all "interface"-type I/O control requests. The actual
3968  *	'doing' part of this is dev_ifsioc above.
3969  */
3970 
3971 /**
3972  *	dev_ioctl	-	network device ioctl
3973  *	@net: the applicable net namespace
3974  *	@cmd: command to issue
3975  *	@arg: pointer to a struct ifreq in user space
3976  *
3977  *	Issue ioctl functions to devices. This is normally called by the
3978  *	user space syscall interfaces but can sometimes be useful for
3979  *	other purposes. The return value is the return from the syscall if
3980  *	positive or a negative errno code on error.
3981  */
3982 
3983 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3984 {
3985 	struct ifreq ifr;
3986 	int ret;
3987 	char *colon;
3988 
3989 	/* One special case: SIOCGIFCONF takes ifconf argument
3990 	   and requires shared lock, because it sleeps writing
3991 	   to user space.
3992 	 */
3993 
3994 	if (cmd == SIOCGIFCONF) {
3995 		rtnl_lock();
3996 		ret = dev_ifconf(net, (char __user *) arg);
3997 		rtnl_unlock();
3998 		return ret;
3999 	}
4000 	if (cmd == SIOCGIFNAME)
4001 		return dev_ifname(net, (struct ifreq __user *)arg);
4002 
4003 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4004 		return -EFAULT;
4005 
4006 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4007 
4008 	colon = strchr(ifr.ifr_name, ':');
4009 	if (colon)
4010 		*colon = 0;
4011 
4012 	/*
4013 	 *	See which interface the caller is talking about.
4014 	 */
4015 
4016 	switch (cmd) {
4017 		/*
4018 		 *	These ioctl calls:
4019 		 *	- can be done by all.
4020 		 *	- atomic and do not require locking.
4021 		 *	- return a value
4022 		 */
4023 		case SIOCGIFFLAGS:
4024 		case SIOCGIFMETRIC:
4025 		case SIOCGIFMTU:
4026 		case SIOCGIFHWADDR:
4027 		case SIOCGIFSLAVE:
4028 		case SIOCGIFMAP:
4029 		case SIOCGIFINDEX:
4030 		case SIOCGIFTXQLEN:
4031 			dev_load(net, ifr.ifr_name);
4032 			read_lock(&dev_base_lock);
4033 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4034 			read_unlock(&dev_base_lock);
4035 			if (!ret) {
4036 				if (colon)
4037 					*colon = ':';
4038 				if (copy_to_user(arg, &ifr,
4039 						 sizeof(struct ifreq)))
4040 					ret = -EFAULT;
4041 			}
4042 			return ret;
4043 
4044 		case SIOCETHTOOL:
4045 			dev_load(net, ifr.ifr_name);
4046 			rtnl_lock();
4047 			ret = dev_ethtool(net, &ifr);
4048 			rtnl_unlock();
4049 			if (!ret) {
4050 				if (colon)
4051 					*colon = ':';
4052 				if (copy_to_user(arg, &ifr,
4053 						 sizeof(struct ifreq)))
4054 					ret = -EFAULT;
4055 			}
4056 			return ret;
4057 
4058 		/*
4059 		 *	These ioctl calls:
4060 		 *	- require superuser power.
4061 		 *	- require strict serialization.
4062 		 *	- return a value
4063 		 */
4064 		case SIOCGMIIPHY:
4065 		case SIOCGMIIREG:
4066 		case SIOCSIFNAME:
4067 			if (!capable(CAP_NET_ADMIN))
4068 				return -EPERM;
4069 			dev_load(net, ifr.ifr_name);
4070 			rtnl_lock();
4071 			ret = dev_ifsioc(net, &ifr, cmd);
4072 			rtnl_unlock();
4073 			if (!ret) {
4074 				if (colon)
4075 					*colon = ':';
4076 				if (copy_to_user(arg, &ifr,
4077 						 sizeof(struct ifreq)))
4078 					ret = -EFAULT;
4079 			}
4080 			return ret;
4081 
4082 		/*
4083 		 *	These ioctl calls:
4084 		 *	- require superuser power.
4085 		 *	- require strict serialization.
4086 		 *	- do not return a value
4087 		 */
4088 		case SIOCSIFFLAGS:
4089 		case SIOCSIFMETRIC:
4090 		case SIOCSIFMTU:
4091 		case SIOCSIFMAP:
4092 		case SIOCSIFHWADDR:
4093 		case SIOCSIFSLAVE:
4094 		case SIOCADDMULTI:
4095 		case SIOCDELMULTI:
4096 		case SIOCSIFHWBROADCAST:
4097 		case SIOCSIFTXQLEN:
4098 		case SIOCSMIIREG:
4099 		case SIOCBONDENSLAVE:
4100 		case SIOCBONDRELEASE:
4101 		case SIOCBONDSETHWADDR:
4102 		case SIOCBONDCHANGEACTIVE:
4103 		case SIOCBRADDIF:
4104 		case SIOCBRDELIF:
4105 			if (!capable(CAP_NET_ADMIN))
4106 				return -EPERM;
4107 			/* fall through */
4108 		case SIOCBONDSLAVEINFOQUERY:
4109 		case SIOCBONDINFOQUERY:
4110 			dev_load(net, ifr.ifr_name);
4111 			rtnl_lock();
4112 			ret = dev_ifsioc(net, &ifr, cmd);
4113 			rtnl_unlock();
4114 			return ret;
4115 
4116 		case SIOCGIFMEM:
4117 			/* Get the per device memory space. We can add this but
4118 			 * currently do not support it */
4119 		case SIOCSIFMEM:
4120 			/* Set the per device memory buffer space.
4121 			 * Not applicable in our case */
4122 		case SIOCSIFLINK:
4123 			return -EINVAL;
4124 
4125 		/*
4126 		 *	Unknown or private ioctl.
4127 		 */
4128 		default:
4129 			if (cmd == SIOCWANDEV ||
4130 			    (cmd >= SIOCDEVPRIVATE &&
4131 			     cmd <= SIOCDEVPRIVATE + 15)) {
4132 				dev_load(net, ifr.ifr_name);
4133 				rtnl_lock();
4134 				ret = dev_ifsioc(net, &ifr, cmd);
4135 				rtnl_unlock();
4136 				if (!ret && copy_to_user(arg, &ifr,
4137 							 sizeof(struct ifreq)))
4138 					ret = -EFAULT;
4139 				return ret;
4140 			}
4141 			/* Take care of Wireless Extensions */
4142 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4143 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4144 			return -EINVAL;
4145 	}
4146 }
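
/*
 * For reference, the user-space side of the "get" ioctls handled above
 * looks roughly like this (illustrative fragment, not kernel code):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFHWADDR, &ifr) == 0)
 *		memcpy(mac, ifr.ifr_hwaddr.sa_data, 6);
 *
 * dev_ioctl() copies the ifreq in, strips any ":alias" suffix from the
 * name, dispatches on the command and, for the "get" commands, copies
 * the updated ifreq back to user space.
 */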
4147 
4148 
4149 /**
4150  *	dev_new_index	-	allocate an ifindex
4151  *	@net: the applicable net namespace
4152  *
4153  *	Returns a suitable unique value for a new device interface
4154  *	number.  The caller must hold the rtnl semaphore or the
4155  *	dev_base_lock to be sure it remains unique.
4156  */
4157 static int dev_new_index(struct net *net)
4158 {
4159 	static int ifindex;
4160 	for (;;) {
4161 		if (++ifindex <= 0)
4162 			ifindex = 1;
4163 		if (!__dev_get_by_index(net, ifindex))
4164 			return ifindex;
4165 	}
4166 }
4167 
4168 /* Delayed registration/unregistration */
4169 static LIST_HEAD(net_todo_list);
4170 
4171 static void net_set_todo(struct net_device *dev)
4172 {
4173 	list_add_tail(&dev->todo_list, &net_todo_list);
4174 }
4175 
4176 static void rollback_registered(struct net_device *dev)
4177 {
4178 	BUG_ON(dev_boot_phase);
4179 	ASSERT_RTNL();
4180 
4181 	/* Some devices call unregister without having registered (initialization unwind). */
4182 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4183 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4184 				  "was registered\n", dev->name, dev);
4185 
4186 		WARN_ON(1);
4187 		return;
4188 	}
4189 
4190 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4191 
4192 	/* If device is running, close it first. */
4193 	dev_close(dev);
4194 
4195 	/* And unlink it from device chain. */
4196 	unlist_netdevice(dev);
4197 
4198 	dev->reg_state = NETREG_UNREGISTERING;
4199 
4200 	synchronize_net();
4201 
4202 	/* Shutdown queueing discipline. */
4203 	dev_shutdown(dev);
4204 
4205 
4206 	/* Notify protocols, that we are about to destroy
4207 	   this device. They should clean all the things.
4208 	*/
4209 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4210 
4211 	/*
4212 	 *	Flush the unicast and multicast chains
4213 	 */
4214 	dev_addr_discard(dev);
4215 
4216 	if (dev->netdev_ops->ndo_uninit)
4217 		dev->netdev_ops->ndo_uninit(dev);
4218 
4219 	/* Notifier chain MUST detach us from master device. */
4220 	WARN_ON(dev->master);
4221 
4222 	/* Remove entries from kobject tree */
4223 	netdev_unregister_kobject(dev);
4224 
4225 	synchronize_net();
4226 
4227 	dev_put(dev);
4228 }
4229 
4230 static void __netdev_init_queue_locks_one(struct net_device *dev,
4231 					  struct netdev_queue *dev_queue,
4232 					  void *_unused)
4233 {
4234 	spin_lock_init(&dev_queue->_xmit_lock);
4235 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4236 	dev_queue->xmit_lock_owner = -1;
4237 }
4238 
4239 static void netdev_init_queue_locks(struct net_device *dev)
4240 {
4241 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4242 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4243 }
4244 
4245 unsigned long netdev_fix_features(unsigned long features, const char *name)
4246 {
4247 	/* Fix illegal SG+CSUM combinations. */
4248 	if ((features & NETIF_F_SG) &&
4249 	    !(features & NETIF_F_ALL_CSUM)) {
4250 		if (name)
4251 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4252 			       "checksum feature.\n", name);
4253 		features &= ~NETIF_F_SG;
4254 	}
4255 
4256 	/* TSO requires that SG is present as well. */
4257 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4258 		if (name)
4259 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4260 			       "SG feature.\n", name);
4261 		features &= ~NETIF_F_TSO;
4262 	}
4263 
4264 	if (features & NETIF_F_UFO) {
4265 		if (!(features & NETIF_F_GEN_CSUM)) {
4266 			if (name)
4267 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4268 				       "since no NETIF_F_HW_CSUM feature.\n",
4269 				       name);
4270 			features &= ~NETIF_F_UFO;
4271 		}
4272 
4273 		if (!(features & NETIF_F_SG)) {
4274 			if (name)
4275 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4276 				       "since no NETIF_F_SG feature.\n", name);
4277 			features &= ~NETIF_F_UFO;
4278 		}
4279 	}
4280 
4281 	return features;
4282 }
4283 EXPORT_SYMBOL(netdev_fix_features);
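
/*
 * Illustrative sketch (hypothetical helper, not part of this file): a
 * driver can run its advertised feature flags through
 * netdev_fix_features() before registration so that illegal combinations
 * such as TSO-without-SG are dropped with a console notice.
 */
static void __maybe_unused example_sanitize_features(struct net_device *dev)
{
	dev->features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_IP_CSUM;
	dev->features = netdev_fix_features(dev->features, dev->name);
}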
4284 
4285 /**
4286  *	register_netdevice	- register a network device
4287  *	@dev: device to register
4288  *
4289  *	Take a completed network device structure and add it to the kernel
4290  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4291  *	chain. 0 is returned on success. A negative errno code is returned
4292  *	on a failure to set up the device, or if the name is a duplicate.
4293  *
4294  *	Callers must hold the rtnl semaphore. You may want
4295  *	register_netdev() instead of this.
4296  *
4297  *	BUGS:
4298  *	The locking appears insufficient to guarantee two parallel registers
4299  *	will not get the same name.
4300  */
4301 
4302 int register_netdevice(struct net_device *dev)
4303 {
4304 	struct hlist_head *head;
4305 	struct hlist_node *p;
4306 	int ret;
4307 	struct net *net = dev_net(dev);
4308 
4309 	BUG_ON(dev_boot_phase);
4310 	ASSERT_RTNL();
4311 
4312 	might_sleep();
4313 
4314 	/* When net_device's are persistent, this will be fatal. */
4315 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4316 	BUG_ON(!net);
4317 
4318 	spin_lock_init(&dev->addr_list_lock);
4319 	netdev_set_addr_lockdep_class(dev);
4320 	netdev_init_queue_locks(dev);
4321 
4322 	dev->iflink = -1;
4323 
4324 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4325 	/* Netdevice_ops API compatibility support.
4326 	 * This is temporary until all network devices are converted.
4327 	 */
4328 	if (dev->netdev_ops) {
4329 		const struct net_device_ops *ops = dev->netdev_ops;
4330 
4331 		dev->init = ops->ndo_init;
4332 		dev->uninit = ops->ndo_uninit;
4333 		dev->open = ops->ndo_open;
4334 		dev->change_rx_flags = ops->ndo_change_rx_flags;
4335 		dev->set_rx_mode = ops->ndo_set_rx_mode;
4336 		dev->set_multicast_list = ops->ndo_set_multicast_list;
4337 		dev->set_mac_address = ops->ndo_set_mac_address;
4338 		dev->validate_addr = ops->ndo_validate_addr;
4339 		dev->do_ioctl = ops->ndo_do_ioctl;
4340 		dev->set_config = ops->ndo_set_config;
4341 		dev->change_mtu = ops->ndo_change_mtu;
4342 		dev->tx_timeout = ops->ndo_tx_timeout;
4343 		dev->get_stats = ops->ndo_get_stats;
4344 		dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4345 		dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4346 		dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4347 #ifdef CONFIG_NET_POLL_CONTROLLER
4348 		dev->poll_controller = ops->ndo_poll_controller;
4349 #endif
4350 	} else {
4351 		char drivername[64];
4352 		pr_info("%s (%s): not using net_device_ops yet\n",
4353 			dev->name, netdev_drivername(dev, drivername, 64));
4354 
4355 		/* This works only because net_device_ops and the
4356 		   compatibility structure are the same. */
4357 		dev->netdev_ops = (void *) &(dev->init);
4358 	}
4359 #endif
4360 
4361 	/* Init, if this function is available */
4362 	if (dev->netdev_ops->ndo_init) {
4363 		ret = dev->netdev_ops->ndo_init(dev);
4364 		if (ret) {
4365 			if (ret > 0)
4366 				ret = -EIO;
4367 			goto out;
4368 		}
4369 	}
4370 
4371 	if (!dev_valid_name(dev->name)) {
4372 		ret = -EINVAL;
4373 		goto err_uninit;
4374 	}
4375 
4376 	dev->ifindex = dev_new_index(net);
4377 	if (dev->iflink == -1)
4378 		dev->iflink = dev->ifindex;
4379 
4380 	/* Check for existence of name */
4381 	head = dev_name_hash(net, dev->name);
4382 	hlist_for_each(p, head) {
4383 		struct net_device *d
4384 			= hlist_entry(p, struct net_device, name_hlist);
4385 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4386 			ret = -EEXIST;
4387 			goto err_uninit;
4388 		}
4389 	}
4390 
4391 	/* Fix illegal checksum combinations */
4392 	if ((dev->features & NETIF_F_HW_CSUM) &&
4393 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4394 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4395 		       dev->name);
4396 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4397 	}
4398 
4399 	if ((dev->features & NETIF_F_NO_CSUM) &&
4400 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4401 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4402 		       dev->name);
4403 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4404 	}
4405 
4406 	dev->features = netdev_fix_features(dev->features, dev->name);
4407 
4408 	/* Enable software GSO if SG is supported. */
4409 	if (dev->features & NETIF_F_SG)
4410 		dev->features |= NETIF_F_GSO;
4411 
4412 	netdev_initialize_kobject(dev);
4413 	ret = netdev_register_kobject(dev);
4414 	if (ret)
4415 		goto err_uninit;
4416 	dev->reg_state = NETREG_REGISTERED;
4417 
4418 	/*
4419 	 *	Default initial state at registration is that the
4420 	 *	device is present.
4421 	 */
4422 
4423 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4424 
4425 	dev_init_scheduler(dev);
4426 	dev_hold(dev);
4427 	list_netdevice(dev);
4428 
4429 	/* Notify protocols, that a new device appeared. */
4430 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4431 	ret = notifier_to_errno(ret);
4432 	if (ret) {
4433 		rollback_registered(dev);
4434 		dev->reg_state = NETREG_UNREGISTERED;
4435 	}
4436 
4437 out:
4438 	return ret;
4439 
4440 err_uninit:
4441 	if (dev->netdev_ops->ndo_uninit)
4442 		dev->netdev_ops->ndo_uninit(dev);
4443 	goto out;
4444 }
4445 
4446 /**
4447  *	init_dummy_netdev	- init a dummy network device for NAPI
4448  *	@dev: device to init
4449  *
4450  *	This takes a network device structure and initializes the minimum
4451  *	number of fields so it can be used to schedule NAPI polls without
4452  *	registering a full blown interface. This is to be used by drivers
4453  *	that need to tie several hardware interfaces to a single NAPI
4454  *	poll scheduler due to HW limitations.
4455  */
4456 int init_dummy_netdev(struct net_device *dev)
4457 {
4458 	/* Clear everything. Note we don't initialize spinlocks
4459 	 * as they aren't supposed to be taken by any of the
4460 	 * NAPI code and this dummy netdev is supposed to be
4461 	 * only ever used for NAPI polls
4462 	 */
4463 	memset(dev, 0, sizeof(struct net_device));
4464 
4465 	/* make sure we BUG if trying to hit standard
4466 	 * register/unregister code path
4467 	 */
4468 	dev->reg_state = NETREG_DUMMY;
4469 
4470 	/* initialize the ref count */
4471 	atomic_set(&dev->refcnt, 1);
4472 
4473 	/* NAPI wants this */
4474 	INIT_LIST_HEAD(&dev->napi_list);
4475 
4476 	/* a dummy interface is started by default */
4477 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4478 	set_bit(__LINK_STATE_START, &dev->state);
4479 
4480 	return 0;
4481 }
4482 EXPORT_SYMBOL_GPL(init_dummy_netdev);
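
/*
 * Illustrative sketch (struct and function names are hypothetical): a
 * driver that has to funnel several hardware channels through a single
 * NAPI context can embed a dummy netdev, initialise it with
 * init_dummy_netdev(), and attach its napi_struct to that instead of a
 * fully registered interface.
 */
struct example_adapter {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static void __maybe_unused example_adapter_init_napi(struct example_adapter *ad,
		int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, poll, 64);
}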
4483 
4484 
4485 /**
4486  *	register_netdev	- register a network device
4487  *	@dev: device to register
4488  *
4489  *	Take a completed network device structure and add it to the kernel
4490  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4491  *	chain. 0 is returned on success. A negative errno code is returned
4492  *	on a failure to set up the device, or if the name is a duplicate.
4493  *
4494  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4495  *	and expands the device name if you passed a format string to
4496  *	alloc_netdev.
4497  */
4498 int register_netdev(struct net_device *dev)
4499 {
4500 	int err;
4501 
4502 	rtnl_lock();
4503 
4504 	/*
4505 	 * If the name is a format string the caller wants us to do a
4506 	 * name allocation.
4507 	 */
4508 	if (strchr(dev->name, '%')) {
4509 		err = dev_alloc_name(dev, dev->name);
4510 		if (err < 0)
4511 			goto out;
4512 	}
4513 
4514 	err = register_netdevice(dev);
4515 out:
4516 	rtnl_unlock();
4517 	return err;
4518 }
4519 EXPORT_SYMBOL(register_netdev);
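
/*
 * Illustrative sketch of the usual driver probe pattern (all names below
 * are hypothetical): allocate the device with a "%d" name template, set
 * up the ops, and let register_netdev() take the RTNL lock and pick the
 * final name.  On failure the half-initialised device is released with
 * free_netdev().
 */
static const struct net_device_ops example_netdev_ops = {
	/* a real driver supplies ndo_open, ndo_stop, ndo_start_xmit, ... */
};

static void example_setup(struct net_device *dev)
{
	dev->netdev_ops = &example_netdev_ops;
}

static __maybe_unused struct net_device *example_probe(void)
{
	struct net_device *dev;

	dev = alloc_netdev(0, "example%d", example_setup);
	if (!dev)
		return NULL;

	if (register_netdev(dev)) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}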
4520 
4521 /*
4522  * netdev_wait_allrefs - wait until all references are gone.
4523  *
4524  * This is called when unregistering network devices.
4525  *
4526  * Any protocol or device that holds a reference should register
4527  * for netdevice notification, and cleanup and put back the
4528  * reference if they receive an UNREGISTER event.
4529  * We can get stuck here if buggy protocols don't correctly
4530  * call dev_put.
4531  */
4532 static void netdev_wait_allrefs(struct net_device *dev)
4533 {
4534 	unsigned long rebroadcast_time, warning_time;
4535 
4536 	rebroadcast_time = warning_time = jiffies;
4537 	while (atomic_read(&dev->refcnt) != 0) {
4538 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4539 			rtnl_lock();
4540 
4541 			/* Rebroadcast unregister notification */
4542 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4543 
4544 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4545 				     &dev->state)) {
4546 				/* We must not have linkwatch events
4547 				 * pending on unregister. If this
4548 				 * happens, we simply run the queue
4549 				 * unscheduled, resulting in a noop
4550 				 * for this device.
4551 				 */
4552 				linkwatch_run_queue();
4553 			}
4554 
4555 			__rtnl_unlock();
4556 
4557 			rebroadcast_time = jiffies;
4558 		}
4559 
4560 		msleep(250);
4561 
4562 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4563 			printk(KERN_EMERG "unregister_netdevice: "
4564 			       "waiting for %s to become free. Usage "
4565 			       "count = %d\n",
4566 			       dev->name, atomic_read(&dev->refcnt));
4567 			warning_time = jiffies;
4568 		}
4569 	}
4570 }
4571 
4572 /* The sequence is:
4573  *
4574  *	rtnl_lock();
4575  *	...
4576  *	register_netdevice(x1);
4577  *	register_netdevice(x2);
4578  *	...
4579  *	unregister_netdevice(y1);
4580  *	unregister_netdevice(y2);
4581  *      ...
4582  *	rtnl_unlock();
4583  *	free_netdev(y1);
4584  *	free_netdev(y2);
4585  *
4586  * We are invoked by rtnl_unlock().
4587  * This allows us to deal with problems:
4588  * 1) We can delete sysfs objects which invoke hotplug
4589  *    without deadlocking with linkwatch via keventd.
4590  * 2) Since we run with the RTNL semaphore not held, we can sleep
4591  *    safely in order to wait for the netdev refcnt to drop to zero.
4592  *
4593  * We must not return until all unregister events added during
4594  * the interval the lock was held have been completed.
4595  */
4596 void netdev_run_todo(void)
4597 {
4598 	struct list_head list;
4599 
4600 	/* Snapshot list, allow later requests */
4601 	list_replace_init(&net_todo_list, &list);
4602 
4603 	__rtnl_unlock();
4604 
4605 	while (!list_empty(&list)) {
4606 		struct net_device *dev
4607 			= list_entry(list.next, struct net_device, todo_list);
4608 		list_del(&dev->todo_list);
4609 
4610 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4611 			printk(KERN_ERR "network todo '%s' but state %d\n",
4612 			       dev->name, dev->reg_state);
4613 			dump_stack();
4614 			continue;
4615 		}
4616 
4617 		dev->reg_state = NETREG_UNREGISTERED;
4618 
4619 		on_each_cpu(flush_backlog, dev, 1);
4620 
4621 		netdev_wait_allrefs(dev);
4622 
4623 		/* paranoia */
4624 		BUG_ON(atomic_read(&dev->refcnt));
4625 		WARN_ON(dev->ip_ptr);
4626 		WARN_ON(dev->ip6_ptr);
4627 		WARN_ON(dev->dn_ptr);
4628 
4629 		if (dev->destructor)
4630 			dev->destructor(dev);
4631 
4632 		/* Free network device */
4633 		kobject_put(&dev->dev.kobj);
4634 	}
4635 }
4636 
4637 /**
4638  *	dev_get_stats	- get network device statistics
4639  *	@dev: device to get statistics from
4640  *
4641  *	Get network statistics from device. The device driver may provide
4642  *	its own method by setting dev->netdev_ops->get_stats; otherwise
4643  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
4644  */
4645 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4646 {
4647 	const struct net_device_ops *ops = dev->netdev_ops;
4648 
4649 	if (ops->ndo_get_stats)
4650 		return ops->ndo_get_stats(dev);
4651 	else
4652 		return &dev->stats;
4653 }
4654 EXPORT_SYMBOL(dev_get_stats);
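
/*
 * Illustrative sketch (hypothetical driver callback): a driver that keeps
 * extra private counters can fold them into dev->stats from ndo_get_stats;
 * a driver that only ever updates dev->stats directly can leave
 * ndo_get_stats unset and rely on the fallback above.
 */
static __maybe_unused struct net_device_stats *example_get_stats(struct net_device *dev)
{
	/* fold any device-private counters into dev->stats here, then ... */
	return &dev->stats;
}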
4655 
4656 static void netdev_init_one_queue(struct net_device *dev,
4657 				  struct netdev_queue *queue,
4658 				  void *_unused)
4659 {
4660 	queue->dev = dev;
4661 }
4662 
4663 static void netdev_init_queues(struct net_device *dev)
4664 {
4665 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4666 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4667 	spin_lock_init(&dev->tx_global_lock);
4668 }
4669 
4670 /**
4671  *	alloc_netdev_mq - allocate network device
4672  *	@sizeof_priv:	size of private data to allocate space for
4673  *	@name:		device name format string
4674  *	@setup:		callback to initialize device
4675  *	@queue_count:	the number of subqueues to allocate
4676  *
4677  *	Allocates a struct net_device with private data area for driver use
4678  *	and performs basic initialization.  Also allocates subqueue structs
4679  *	for each queue on the device at the end of the netdevice.
4680  */
4681 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4682 		void (*setup)(struct net_device *), unsigned int queue_count)
4683 {
4684 	struct netdev_queue *tx;
4685 	struct net_device *dev;
4686 	size_t alloc_size;
4687 	void *p;
4688 
4689 	BUG_ON(strlen(name) >= sizeof(dev->name));
4690 
4691 	alloc_size = sizeof(struct net_device);
4692 	if (sizeof_priv) {
4693 		/* ensure 32-byte alignment of private area */
4694 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4695 		alloc_size += sizeof_priv;
4696 	}
4697 	/* ensure 32-byte alignment of whole construct */
4698 	alloc_size += NETDEV_ALIGN_CONST;
4699 
4700 	p = kzalloc(alloc_size, GFP_KERNEL);
4701 	if (!p) {
4702 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4703 		return NULL;
4704 	}
4705 
4706 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4707 	if (!tx) {
4708 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4709 		       "tx qdiscs.\n");
4710 		kfree(p);
4711 		return NULL;
4712 	}
4713 
4714 	dev = (struct net_device *)
4715 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4716 	dev->padded = (char *)dev - (char *)p;
4717 	dev_net_set(dev, &init_net);
4718 
4719 	dev->_tx = tx;
4720 	dev->num_tx_queues = queue_count;
4721 	dev->real_num_tx_queues = queue_count;
4722 
4723 	dev->gso_max_size = GSO_MAX_SIZE;
4724 
4725 	netdev_init_queues(dev);
4726 
4727 	INIT_LIST_HEAD(&dev->napi_list);
4728 	setup(dev);
4729 	strcpy(dev->name, name);
4730 	return dev;
4731 }
4732 EXPORT_SYMBOL(alloc_netdev_mq);
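
/*
 * Illustrative sketch (hypothetical names): the private area requested
 * via @sizeof_priv is reached with netdev_priv(), and a multiqueue
 * driver passes its queue count as the last argument.
 */
struct example_mq_priv {
	unsigned long	tx_restarts;
};

static void example_mq_setup(struct net_device *dev)
{
	struct example_mq_priv *priv = netdev_priv(dev);

	priv->tx_restarts = 0;
}

static __maybe_unused struct net_device *example_alloc_mq(unsigned int nqueues)
{
	return alloc_netdev_mq(sizeof(struct example_mq_priv), "mq%d",
			       example_mq_setup, nqueues);
}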
4733 
4734 /**
4735  *	free_netdev - free network device
4736  *	@dev: device
4737  *
4738  *	This function does the last stage of destroying an allocated device
4739  * 	interface. The reference to the device object is released.
4740  *	If this is the last reference then it will be freed.
4741  */
4742 void free_netdev(struct net_device *dev)
4743 {
4744 	struct napi_struct *p, *n;
4745 
4746 	release_net(dev_net(dev));
4747 
4748 	kfree(dev->_tx);
4749 
4750 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4751 		netif_napi_del(p);
4752 
4753 	/*  Compatibility with error handling in drivers */
4754 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4755 		kfree((char *)dev - dev->padded);
4756 		return;
4757 	}
4758 
4759 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4760 	dev->reg_state = NETREG_RELEASED;
4761 
4762 	/* will free via device release */
4763 	put_device(&dev->dev);
4764 }
4765 
4766 /**
4767  *	synchronize_net -  Synchronize with packet receive processing
4768  *
4769  *	Wait for packets currently being received to be done.
4770  *	Does not block later packets from starting.
4771  */
4772 void synchronize_net(void)
4773 {
4774 	might_sleep();
4775 	synchronize_rcu();
4776 }
4777 
4778 /**
4779  *	unregister_netdevice - remove device from the kernel
4780  *	@dev: device
4781  *
4782  *	This function shuts down a device interface and removes it
4783  *	from the kernel tables.
4784  *
4785  *	Callers must hold the rtnl semaphore.  You may want
4786  *	unregister_netdev() instead of this.
4787  */
4788 
4789 void unregister_netdevice(struct net_device *dev)
4790 {
4791 	ASSERT_RTNL();
4792 
4793 	rollback_registered(dev);
4794 	/* Finish processing unregister after unlock */
4795 	net_set_todo(dev);
4796 }
4797 
4798 /**
4799  *	unregister_netdev - remove device from the kernel
4800  *	@dev: device
4801  *
4802  *	This function shuts down a device interface and removes it
4803  *	from the kernel tables.
4804  *
4805  *	This is just a wrapper for unregister_netdevice that takes
4806  *	the rtnl semaphore.  In general you want to use this and not
4807  *	unregister_netdevice.
4808  */
4809 void unregister_netdev(struct net_device *dev)
4810 {
4811 	rtnl_lock();
4812 	unregister_netdevice(dev);
4813 	rtnl_unlock();
4814 }
4815 
4816 EXPORT_SYMBOL(unregister_netdev);
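
/*
 * Illustrative sketch (hypothetical remove path): teardown is the mirror
 * of the probe pattern - unregister first, which blocks until all
 * references are gone, then release the memory.
 */
static void __maybe_unused example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and releases the RTNL lock */
	free_netdev(dev);
}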
4817 
4818 /**
4819  *	dev_change_net_namespace - move device to a different network namespace
4820  *	@dev: device
4821  *	@net: network namespace
4822  *	@pat: If not NULL name pattern to try if the current device name
4823  *	      is already taken in the destination network namespace.
4824  *
4825  *	This function shuts down a device interface and moves it
4826  *	to a new network namespace. On success 0 is returned, on
4827  *	a failure a negative errno code is returned.
4828  *
4829  *	Callers must hold the rtnl semaphore.
4830  */
4831 
4832 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4833 {
4834 	char buf[IFNAMSIZ];
4835 	const char *destname;
4836 	int err;
4837 
4838 	ASSERT_RTNL();
4839 
4840 	/* Don't allow namespace local devices to be moved. */
4841 	err = -EINVAL;
4842 	if (dev->features & NETIF_F_NETNS_LOCAL)
4843 		goto out;
4844 
4845 #ifdef CONFIG_SYSFS
4846 	/* Don't allow real devices to be moved when sysfs
4847 	 * is enabled.
4848 	 */
4849 	err = -EINVAL;
4850 	if (dev->dev.parent)
4851 		goto out;
4852 #endif
4853 
4854 	/* Ensure the device has been registered */
4855 	err = -EINVAL;
4856 	if (dev->reg_state != NETREG_REGISTERED)
4857 		goto out;
4858 
4859 	/* Get out if there is nothing to do */
4860 	err = 0;
4861 	if (net_eq(dev_net(dev), net))
4862 		goto out;
4863 
4864 	/* Pick the destination device name, and ensure
4865 	 * we can use it in the destination network namespace.
4866 	 */
4867 	err = -EEXIST;
4868 	destname = dev->name;
4869 	if (__dev_get_by_name(net, destname)) {
4870 		/* We get here if we can't use the current device name */
4871 		if (!pat)
4872 			goto out;
4873 		if (!dev_valid_name(pat))
4874 			goto out;
4875 		if (strchr(pat, '%')) {
4876 			if (__dev_alloc_name(net, pat, buf) < 0)
4877 				goto out;
4878 			destname = buf;
4879 		} else
4880 			destname = pat;
4881 		if (__dev_get_by_name(net, destname))
4882 			goto out;
4883 	}
4884 
4885 	/*
4886 	 * And now a mini version of register_netdevice and unregister_netdevice.
4887 	 */
4888 
4889 	/* If device is running close it first. */
4890 	dev_close(dev);
4891 
4892 	/* And unlink it from device chain */
4893 	err = -ENODEV;
4894 	unlist_netdevice(dev);
4895 
4896 	synchronize_net();
4897 
4898 	/* Shutdown queueing discipline. */
4899 	dev_shutdown(dev);
4900 
4901 	/* Notify protocols, that we are about to destroy
4902 	   this device. They should clean all the things.
4903 	*/
4904 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4905 
4906 	/*
4907 	 *	Flush the unicast and multicast chains
4908 	 */
4909 	dev_addr_discard(dev);
4910 
4911 	netdev_unregister_kobject(dev);
4912 
4913 	/* Actually switch the network namespace */
4914 	dev_net_set(dev, net);
4915 
4916 	/* Assign the new device name */
4917 	if (destname != dev->name)
4918 		strcpy(dev->name, destname);
4919 
4920 	/* If there is an ifindex conflict assign a new one */
4921 	if (__dev_get_by_index(net, dev->ifindex)) {
4922 		int iflink = (dev->iflink == dev->ifindex);
4923 		dev->ifindex = dev_new_index(net);
4924 		if (iflink)
4925 			dev->iflink = dev->ifindex;
4926 	}
4927 
4928 	/* Fixup kobjects */
4929 	err = netdev_register_kobject(dev);
4930 	WARN_ON(err);
4931 
4932 	/* Add the device back in the hashes */
4933 	list_netdevice(dev);
4934 
4935 	/* Notify protocols, that a new device appeared. */
4936 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4937 
4938 	synchronize_net();
4939 	err = 0;
4940 out:
4941 	return err;
4942 }
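
/*
 * Illustrative sketch (hypothetical caller): moving a device into another
 * namespace is done under the RTNL lock; the pattern argument is only
 * consulted if the current name clashes in the target namespace.
 */
static int __maybe_unused example_move_dev(struct net_device *dev,
					   struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "moved%d");
	rtnl_unlock();
	return err;
}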
4943 
4944 static int dev_cpu_callback(struct notifier_block *nfb,
4945 			    unsigned long action,
4946 			    void *ocpu)
4947 {
4948 	struct sk_buff **list_skb;
4949 	struct Qdisc **list_net;
4950 	struct sk_buff *skb;
4951 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4952 	struct softnet_data *sd, *oldsd;
4953 
4954 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4955 		return NOTIFY_OK;
4956 
4957 	local_irq_disable();
4958 	cpu = smp_processor_id();
4959 	sd = &per_cpu(softnet_data, cpu);
4960 	oldsd = &per_cpu(softnet_data, oldcpu);
4961 
4962 	/* Find end of our completion_queue. */
4963 	list_skb = &sd->completion_queue;
4964 	while (*list_skb)
4965 		list_skb = &(*list_skb)->next;
4966 	/* Append completion queue from offline CPU. */
4967 	*list_skb = oldsd->completion_queue;
4968 	oldsd->completion_queue = NULL;
4969 
4970 	/* Find end of our output_queue. */
4971 	list_net = &sd->output_queue;
4972 	while (*list_net)
4973 		list_net = &(*list_net)->next_sched;
4974 	/* Append output queue from offline CPU. */
4975 	*list_net = oldsd->output_queue;
4976 	oldsd->output_queue = NULL;
4977 
4978 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4979 	local_irq_enable();
4980 
4981 	/* Process offline CPU's input_pkt_queue */
4982 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4983 		netif_rx(skb);
4984 
4985 	return NOTIFY_OK;
4986 }
4987 
4988 
4989 /**
4990  *	netdev_increment_features - increment feature set by one
4991  *	@all: current feature set
4992  *	@one: new feature set
4993  *	@mask: mask feature set
4994  *
4995  *	Computes a new feature set after adding a device with feature set
4996  *	@one to the master device with current feature set @all.  Will not
4997  *	enable anything that is off in @mask. Returns the new feature set.
4998  */
4999 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5000 					unsigned long mask)
5001 {
5002 	/* If device needs checksumming, downgrade to it. */
5003 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5004 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5005 	else if (mask & NETIF_F_ALL_CSUM) {
5006 		/* If one device supports v4/v6 checksumming, set for all. */
5007 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5008 		    !(all & NETIF_F_GEN_CSUM)) {
5009 			all &= ~NETIF_F_ALL_CSUM;
5010 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5011 		}
5012 
5013 		/* If one device supports hw checksumming, set for all. */
5014 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5015 			all &= ~NETIF_F_ALL_CSUM;
5016 			all |= NETIF_F_HW_CSUM;
5017 		}
5018 	}
5019 
5020 	one |= NETIF_F_ALL_CSUM;
5021 
5022 	one |= all & NETIF_F_ONE_FOR_ALL;
5023 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5024 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5025 
5026 	return all;
5027 }
5028 EXPORT_SYMBOL(netdev_increment_features);
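
/*
 * Illustrative sketch (hypothetical master/slave pair): an aggregating
 * driver recomputes its feature set by folding in each slave with
 * netdev_increment_features() and then sanitising the result.
 */
static __maybe_unused unsigned long
example_master_features(const struct net_device *slave_a,
			const struct net_device *slave_b)
{
	unsigned long features = NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_TSO;

	features = netdev_increment_features(features, slave_a->features,
					     NETIF_F_ONE_FOR_ALL);
	features = netdev_increment_features(features, slave_b->features,
					     NETIF_F_ONE_FOR_ALL);
	return netdev_fix_features(features, NULL);
}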
5029 
5030 static struct hlist_head *netdev_create_hash(void)
5031 {
5032 	int i;
5033 	struct hlist_head *hash;
5034 
5035 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5036 	if (hash != NULL)
5037 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5038 			INIT_HLIST_HEAD(&hash[i]);
5039 
5040 	return hash;
5041 }
5042 
5043 /* Initialize per network namespace state */
5044 static int __net_init netdev_init(struct net *net)
5045 {
5046 	INIT_LIST_HEAD(&net->dev_base_head);
5047 
5048 	net->dev_name_head = netdev_create_hash();
5049 	if (net->dev_name_head == NULL)
5050 		goto err_name;
5051 
5052 	net->dev_index_head = netdev_create_hash();
5053 	if (net->dev_index_head == NULL)
5054 		goto err_idx;
5055 
5056 	return 0;
5057 
5058 err_idx:
5059 	kfree(net->dev_name_head);
5060 err_name:
5061 	return -ENOMEM;
5062 }
5063 
5064 /**
5065  *	netdev_drivername - network driver for the device
5066  *	@dev: network device
5067  *	@buffer: buffer for resulting name
5068  *	@len: size of buffer
5069  *
5070  *	Determine network driver for device.
5071  */
5072 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5073 {
5074 	const struct device_driver *driver;
5075 	const struct device *parent;
5076 
5077 	if (len <= 0 || !buffer)
5078 		return buffer;
5079 	buffer[0] = 0;
5080 
5081 	parent = dev->dev.parent;
5082 
5083 	if (!parent)
5084 		return buffer;
5085 
5086 	driver = parent->driver;
5087 	if (driver && driver->name)
5088 		strlcpy(buffer, driver->name, len);
5089 	return buffer;
5090 }
5091 
5092 static void __net_exit netdev_exit(struct net *net)
5093 {
5094 	kfree(net->dev_name_head);
5095 	kfree(net->dev_index_head);
5096 }
5097 
5098 static struct pernet_operations __net_initdata netdev_net_ops = {
5099 	.init = netdev_init,
5100 	.exit = netdev_exit,
5101 };
5102 
5103 static void __net_exit default_device_exit(struct net *net)
5104 {
5105 	struct net_device *dev;
5106 	/*
5107 	 * Push all migratable network devices back to the
5108 	 * initial network namespace
5109 	 */
5110 	rtnl_lock();
5111 restart:
5112 	for_each_netdev(net, dev) {
5113 		int err;
5114 		char fb_name[IFNAMSIZ];
5115 
5116 		/* Ignore unmovable devices (e.g. loopback) */
5117 		if (dev->features & NETIF_F_NETNS_LOCAL)
5118 			continue;
5119 
5120 		/* Delete virtual devices */
5121 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5122 			dev->rtnl_link_ops->dellink(dev);
5123 			goto restart;
5124 		}
5125 
5126 		/* Push remaining network devices to init_net */
5127 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5128 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5129 		if (err) {
5130 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5131 				__func__, dev->name, err);
5132 			BUG();
5133 		}
5134 		goto restart;
5135 	}
5136 	rtnl_unlock();
5137 }
5138 
5139 static struct pernet_operations __net_initdata default_device_ops = {
5140 	.exit = default_device_exit,
5141 };
5142 
5143 /*
5144  *	Initialize the DEV module. At boot time this walks the device list and
5145  *	unhooks any devices that fail to initialise (normally hardware not
5146  *	present) and leaves us with a valid list of present and active devices.
5147  *
5148  */
5149 
5150 /*
5151  *       This is called single threaded during boot, so no need
5152  *       to take the rtnl semaphore.
5153  */
5154 static int __init net_dev_init(void)
5155 {
5156 	int i, rc = -ENOMEM;
5157 
5158 	BUG_ON(!dev_boot_phase);
5159 
5160 	if (dev_proc_init())
5161 		goto out;
5162 
5163 	if (netdev_kobject_init())
5164 		goto out;
5165 
5166 	INIT_LIST_HEAD(&ptype_all);
5167 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5168 		INIT_LIST_HEAD(&ptype_base[i]);
5169 
5170 	if (register_pernet_subsys(&netdev_net_ops))
5171 		goto out;
5172 
5173 	/*
5174 	 *	Initialise the packet receive queues.
5175 	 */
5176 
5177 	for_each_possible_cpu(i) {
5178 		struct softnet_data *queue;
5179 
5180 		queue = &per_cpu(softnet_data, i);
5181 		skb_queue_head_init(&queue->input_pkt_queue);
5182 		queue->completion_queue = NULL;
5183 		INIT_LIST_HEAD(&queue->poll_list);
5184 
5185 		queue->backlog.poll = process_backlog;
5186 		queue->backlog.weight = weight_p;
5187 		queue->backlog.gro_list = NULL;
5188 	}
5189 
5190 	dev_boot_phase = 0;
5191 
5192 	/* The loopback device is special: if any other network device
5193 	 * is present in a network namespace, the loopback device must
5194 	 * be present too. Since we now dynamically allocate and free the
5195 	 * loopback device, ensure this invariant is maintained by
5196 	 * keeping the loopback device as the first device on the
5197 	 * list of network devices, so that the loopback device
5198 	 * is the first device that appears and the last network device
5199 	 * that disappears.
5200 	 */
5201 	if (register_pernet_device(&loopback_net_ops))
5202 		goto out;
5203 
5204 	if (register_pernet_device(&default_device_ops))
5205 		goto out;
5206 
5207 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5208 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5209 
5210 	hotcpu_notifier(dev_cpu_callback, 0);
5211 	dst_init();
5212 	dev_mcast_init();
5213 	rc = 0;
5214 out:
5215 	return rc;
5216 }
5217 
5218 subsys_initcall(net_dev_init);
5219 
5220 EXPORT_SYMBOL(__dev_get_by_index);
5221 EXPORT_SYMBOL(__dev_get_by_name);
5222 EXPORT_SYMBOL(__dev_remove_pack);
5223 EXPORT_SYMBOL(dev_valid_name);
5224 EXPORT_SYMBOL(dev_add_pack);
5225 EXPORT_SYMBOL(dev_alloc_name);
5226 EXPORT_SYMBOL(dev_close);
5227 EXPORT_SYMBOL(dev_get_by_flags);
5228 EXPORT_SYMBOL(dev_get_by_index);
5229 EXPORT_SYMBOL(dev_get_by_name);
5230 EXPORT_SYMBOL(dev_open);
5231 EXPORT_SYMBOL(dev_queue_xmit);
5232 EXPORT_SYMBOL(dev_remove_pack);
5233 EXPORT_SYMBOL(dev_set_allmulti);
5234 EXPORT_SYMBOL(dev_set_promiscuity);
5235 EXPORT_SYMBOL(dev_change_flags);
5236 EXPORT_SYMBOL(dev_set_mtu);
5237 EXPORT_SYMBOL(dev_set_mac_address);
5238 EXPORT_SYMBOL(free_netdev);
5239 EXPORT_SYMBOL(netdev_boot_setup_check);
5240 EXPORT_SYMBOL(netdev_set_master);
5241 EXPORT_SYMBOL(netdev_state_change);
5242 EXPORT_SYMBOL(netif_receive_skb);
5243 EXPORT_SYMBOL(netif_rx);
5244 EXPORT_SYMBOL(register_gifconf);
5245 EXPORT_SYMBOL(register_netdevice);
5246 EXPORT_SYMBOL(register_netdevice_notifier);
5247 EXPORT_SYMBOL(skb_checksum_help);
5248 EXPORT_SYMBOL(synchronize_net);
5249 EXPORT_SYMBOL(unregister_netdevice);
5250 EXPORT_SYMBOL(unregister_netdevice_notifier);
5251 EXPORT_SYMBOL(net_enable_timestamp);
5252 EXPORT_SYMBOL(net_disable_timestamp);
5253 EXPORT_SYMBOL(dev_get_flags);
5254 
5255 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5256 EXPORT_SYMBOL(br_handle_frame_hook);
5257 EXPORT_SYMBOL(br_fdb_get_hook);
5258 EXPORT_SYMBOL(br_fdb_put_hook);
5259 #endif
5260 
5261 EXPORT_SYMBOL(dev_load);
5262 
5263 EXPORT_PER_CPU_SYMBOL(softnet_data);
5264