xref: /linux/net/core/dev.c (revision 32910e2c52cae552f2651c5360bae8033adb8aac)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16. Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
168 
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * See, for example usages, register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
195 
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
212 {
213 	struct net *net = dev_net(dev);
214 
215 	ASSERT_RTNL();
216 
217 	write_lock_bh(&dev_base_lock);
218 	list_add_tail(&dev->dev_list, &net->dev_base_head);
219 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
220 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 	ASSERT_RTNL();
229 
230 	/* Unlink dev from the device chain */
231 	write_lock_bh(&dev_base_lock);
232 	list_del(&dev->dev_list);
233 	hlist_del(&dev->name_hlist);
234 	hlist_del(&dev->index_hlist);
235 	write_unlock_bh(&dev_base_lock);
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 }
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 }
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers, mangling input packets,
345  *	MUST BE last in hash buckets and checking protocol handlers
346  *	MUST start from promiscuous ptype_all chain in net_bh.
347  *	It is true now, do not change it.
348  *	Explanation follows: if protocol handler, mangling packet, will
349  *	be the first on list, it is not able to sense, that packet
350  *	is cloned and should be copied-on-write, so that it will
351  *	change it and subsequent readers will get broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep therefore it can not
364  *	guarantee all CPU's that are in middle of receiving packets
365  *	will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
381 
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPU's have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432 	__dev_remove_pack(pt);
433 
434 	synchronize_net();
435 }
436 
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine to
453  *	all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot time settings for the device.
478  *	The found settings are set for the device to be used
479  *	later in the device probing.
480  *	Returns 0 if no settings found, 1 if they are.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot time settings for the base address of device.
507  *	The found settings are set for the device to be used
508  *	later in the device probing.
509  *	Returns 0 if no settings found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If device already registered then return base of 1
521 	 * to indicate not to probe for this interface
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves at boot time configured settings for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
560 
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
615 
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns %NULL if the device
622  *	is not found or a pointer to the device. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns NULL if the device
648  *	is not found or a pointer to the device. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns NULL if the device
672  *	is not found or a pointer to the device. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count increased
674  *	and the caller must therefore be careful about locking
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *
730  *	Search for any interface with the given flags. Returns NULL if a device
731  *	is not found or a pointer to the device. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names to
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
777 
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" it will try and find a suitable
785  *	id. It scans list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
794 {
795 	int i = 0;
796 	const char *p;
797 	const int max_netdevices = 8*PAGE_SIZE;
798 	unsigned long *inuse;
799 	struct net_device *d;
800 
801 	p = strnchr(name, IFNAMSIZ-1, '%');
802 	if (p) {
803 		/*
804 		 * Verify the string as this thing may have come from
805 		 * the user.  There must be either one "%d" and no other "%"
806 		 * characters.
807 		 */
808 		if (p[1] != 'd' || strchr(p + 2, '%'))
809 			return -EINVAL;
810 
811 		/* Use one page as a bit array of possible slots */
812 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
813 		if (!inuse)
814 			return -ENOMEM;
815 
816 		for_each_netdev(net, d) {
817 			if (!sscanf(d->name, name, &i))
818 				continue;
819 			if (i < 0 || i >= max_netdevices)
820 				continue;
821 
822 			/*  avoid cases where sscanf is not exact inverse of printf */
823 			snprintf(buf, IFNAMSIZ, name, i);
824 			if (!strncmp(buf, d->name, IFNAMSIZ))
825 				set_bit(i, inuse);
826 		}
827 
828 		i = find_first_zero_bit(inuse, max_netdevices);
829 		free_page((unsigned long) inuse);
830 	}
831 
832 	snprintf(buf, IFNAMSIZ, name, i);
833 	if (!__dev_get_by_name(net, buf))
834 		return i;
835 
836 	/* It is possible to run out of possible slots
837 	 * when the name is long and there isn't enough space left
838 	 * for the digits, or if all bits are used.
839 	 */
840 	return -ENFILE;
841 }
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" it will try and find a suitable
849  *	id. It scans list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
870 
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *
877  *	Change name of a device, can pass format strings "eth%d".
878  *	for wildcarding.
879  */
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	}
907 	else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from info
952  *
953  *	Set ifalias for a device,
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
985 void netdev_features_change(struct net_device *dev)
986 {
987 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
988 }
989 EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Enable NET_DMA
1092 		 */
1093 		net_dmaengine_get();
1094 
1095 		/*
1096 		 *	Initialize multicasting status
1097 		 */
1098 		dev_set_rx_mode(dev);
1099 
1100 		/*
1101 		 *	Wakeup transmit queue engine
1102 		 */
1103 		dev_activate(dev);
1104 
1105 		/*
1106 		 *	... and announce new interface.
1107 		 */
1108 		call_netdevice_notifiers(NETDEV_UP, dev);
1109 	}
1110 
1111 	return ret;
1112 }
1113 
1114 /**
1115  *	dev_close - shutdown an interface.
1116  *	@dev: device to shutdown
1117  *
1118  *	This function moves an active device into down state. A
1119  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1120  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1121  *	chain.
1122  */
1123 int dev_close(struct net_device *dev)
1124 {
1125 	const struct net_device_ops *ops = dev->netdev_ops;
1126 	ASSERT_RTNL();
1127 
1128 	might_sleep();
1129 
1130 	if (!(dev->flags & IFF_UP))
1131 		return 0;
1132 
1133 	/*
1134 	 *	Tell people we are going down, so that they can
1135 	 *	prepare to death, when device is still operating.
1136 	 */
1137 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1138 
1139 	clear_bit(__LINK_STATE_START, &dev->state);
1140 
1141 	/* Synchronize to scheduled poll. We cannot touch poll list,
1142 	 * it can be even on different cpu. So just clear netif_running().
1143 	 *
1144 	 * dev->stop() will invoke napi_disable() on all of it's
1145 	 * napi_struct instances on this device.
1146 	 */
1147 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1148 
1149 	dev_deactivate(dev);
1150 
1151 	/*
1152 	 *	Call the device specific close. This cannot fail.
1153 	 *	Only if device is UP
1154 	 *
1155 	 *	We allow it to be called even after a DETACH hot-plug
1156 	 *	event.
1157 	 */
1158 	if (ops->ndo_stop)
1159 		ops->ndo_stop(dev);
1160 
1161 	/*
1162 	 *	Device is now down.
1163 	 */
1164 
1165 	dev->flags &= ~IFF_UP;
1166 
1167 	/*
1168 	 * Tell people we are down
1169 	 */
1170 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1171 
1172 	/*
1173 	 *	Shutdown NET_DMA
1174 	 */
1175 	net_dmaengine_put();
1176 
1177 	return 0;
1178 }
1179 
1180 
1181 /**
1182  *	dev_disable_lro - disable Large Receive Offload on a device
1183  *	@dev: device
1184  *
1185  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186  *	called under RTNL.  This is needed if received packets may be
1187  *	forwarded to another interface.
1188  */
1189 void dev_disable_lro(struct net_device *dev)
1190 {
1191 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192 	    dev->ethtool_ops->set_flags) {
1193 		u32 flags = dev->ethtool_ops->get_flags(dev);
1194 		if (flags & ETH_FLAG_LRO) {
1195 			flags &= ~ETH_FLAG_LRO;
1196 			dev->ethtool_ops->set_flags(dev, flags);
1197 		}
1198 	}
1199 	WARN_ON(dev->features & NETIF_F_LRO);
1200 }
1201 EXPORT_SYMBOL(dev_disable_lro);
1202 
1203 
1204 static int dev_boot_phase = 1;
1205 
1206 /*
1207  *	Device change register/unregister. These are not inline or static
1208  *	as we export them to the world.
1209  */
1210 
1211 /**
1212  *	register_netdevice_notifier - register a network notifier block
1213  *	@nb: notifier
1214  *
1215  *	Register a notifier to be called when network device events occur.
1216  *	The notifier passed is linked into the kernel structures and must
1217  *	not be reused until it has been unregistered. A negative errno code
1218  *	is returned on a failure.
1219  *
1220  * 	When registered all registration and up events are replayed
1221  *	to the new notifier to allow device to have a race free
1222  *	view of the network device list.
1223  */
1224 
1225 int register_netdevice_notifier(struct notifier_block *nb)
1226 {
1227 	struct net_device *dev;
1228 	struct net_device *last;
1229 	struct net *net;
1230 	int err;
1231 
1232 	rtnl_lock();
1233 	err = raw_notifier_chain_register(&netdev_chain, nb);
1234 	if (err)
1235 		goto unlock;
1236 	if (dev_boot_phase)
1237 		goto unlock;
1238 	for_each_net(net) {
1239 		for_each_netdev(net, dev) {
1240 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241 			err = notifier_to_errno(err);
1242 			if (err)
1243 				goto rollback;
1244 
1245 			if (!(dev->flags & IFF_UP))
1246 				continue;
1247 
1248 			nb->notifier_call(nb, NETDEV_UP, dev);
1249 		}
1250 	}
1251 
1252 unlock:
1253 	rtnl_unlock();
1254 	return err;
1255 
1256 rollback:
1257 	last = dev;
1258 	for_each_net(net) {
1259 		for_each_netdev(net, dev) {
1260 			if (dev == last)
1261 				break;
1262 
1263 			if (dev->flags & IFF_UP) {
1264 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266 			}
1267 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268 		}
1269 	}
1270 
1271 	raw_notifier_chain_unregister(&netdev_chain, nb);
1272 	goto unlock;
1273 }
1274 
1275 /**
1276  *	unregister_netdevice_notifier - unregister a network notifier block
1277  *	@nb: notifier
1278  *
1279  *	Unregister a notifier previously registered by
1280  *	register_netdevice_notifier(). The notifier is unlinked into the
1281  *	kernel structures and may then be reused. A negative errno code
1282  *	is returned on a failure.
1283  */
1284 
1285 int unregister_netdevice_notifier(struct notifier_block *nb)
1286 {
1287 	int err;
1288 
1289 	rtnl_lock();
1290 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1291 	rtnl_unlock();
1292 	return err;
1293 }
1294 
1295 /**
1296  *	call_netdevice_notifiers - call all network notifier blocks
1297  *      @val: value passed unmodified to notifier function
1298  *      @dev: net_device pointer passed unmodified to notifier function
1299  *
1300  *	Call all network notifier blocks.  Parameters and return value
1301  *	are as for raw_notifier_call_chain().
1302  */
1303 
1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1305 {
1306 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1307 }
1308 
1309 /* When > 0 there are consumers of rx skb time stamps */
1310 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311 
1312 void net_enable_timestamp(void)
1313 {
1314 	atomic_inc(&netstamp_needed);
1315 }
1316 
1317 void net_disable_timestamp(void)
1318 {
1319 	atomic_dec(&netstamp_needed);
1320 }
1321 
1322 static inline void net_timestamp(struct sk_buff *skb)
1323 {
1324 	if (atomic_read(&netstamp_needed))
1325 		__net_timestamp(skb);
1326 	else
1327 		skb->tstamp.tv64 = 0;
1328 }
1329 
1330 /*
1331  *	Support routine. Sends outgoing frames to any network
1332  *	taps currently in use.
1333  */
1334 
1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336 {
1337 	struct packet_type *ptype;
1338 
1339 	net_timestamp(skb);
1340 
1341 	rcu_read_lock();
1342 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343 		/* Never send packets back to the socket
1344 		 * they originated from - MvS (miquels@drinkel.ow.org)
1345 		 */
1346 		if ((ptype->dev == dev || !ptype->dev) &&
1347 		    (ptype->af_packet_priv == NULL ||
1348 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1350 			if (!skb2)
1351 				break;
1352 
1353 			/* skb->nh should be correctly
1354 			   set by sender, so that the second statement is
1355 			   just protection against buggy protocols.
1356 			 */
1357 			skb_reset_mac_header(skb2);
1358 
1359 			if (skb_network_header(skb2) < skb2->data ||
1360 			    skb2->network_header > skb2->tail) {
1361 				if (net_ratelimit())
1362 					printk(KERN_CRIT "protocol %04x is "
1363 					       "buggy, dev %s\n",
1364 					       skb2->protocol, dev->name);
1365 				skb_reset_network_header(skb2);
1366 			}
1367 
1368 			skb2->transport_header = skb2->network_header;
1369 			skb2->pkt_type = PACKET_OUTGOING;
1370 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371 		}
1372 	}
1373 	rcu_read_unlock();
1374 }
1375 
1376 
1377 static inline void __netif_reschedule(struct Qdisc *q)
1378 {
1379 	struct softnet_data *sd;
1380 	unsigned long flags;
1381 
1382 	local_irq_save(flags);
1383 	sd = &__get_cpu_var(softnet_data);
1384 	q->next_sched = sd->output_queue;
1385 	sd->output_queue = q;
1386 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1387 	local_irq_restore(flags);
1388 }
1389 
1390 void __netif_schedule(struct Qdisc *q)
1391 {
1392 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1393 		__netif_reschedule(q);
1394 }
1395 EXPORT_SYMBOL(__netif_schedule);
1396 
1397 void dev_kfree_skb_irq(struct sk_buff *skb)
1398 {
1399 	if (atomic_dec_and_test(&skb->users)) {
1400 		struct softnet_data *sd;
1401 		unsigned long flags;
1402 
1403 		local_irq_save(flags);
1404 		sd = &__get_cpu_var(softnet_data);
1405 		skb->next = sd->completion_queue;
1406 		sd->completion_queue = skb;
1407 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1408 		local_irq_restore(flags);
1409 	}
1410 }
1411 EXPORT_SYMBOL(dev_kfree_skb_irq);
1412 
1413 void dev_kfree_skb_any(struct sk_buff *skb)
1414 {
1415 	if (in_irq() || irqs_disabled())
1416 		dev_kfree_skb_irq(skb);
1417 	else
1418 		dev_kfree_skb(skb);
1419 }
1420 EXPORT_SYMBOL(dev_kfree_skb_any);
1421 
1422 
1423 /**
1424  * netif_device_detach - mark device as removed
1425  * @dev: network device
1426  *
1427  * Mark device as removed from system and therefore no longer available.
1428  */
1429 void netif_device_detach(struct net_device *dev)
1430 {
1431 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432 	    netif_running(dev)) {
1433 		netif_stop_queue(dev);
1434 	}
1435 }
1436 EXPORT_SYMBOL(netif_device_detach);
1437 
1438 /**
1439  * netif_device_attach - mark device as attached
1440  * @dev: network device
1441  *
1442  * Mark device as attached from system and restart if needed.
1443  */
1444 void netif_device_attach(struct net_device *dev)
1445 {
1446 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447 	    netif_running(dev)) {
1448 		netif_wake_queue(dev);
1449 		__netdev_watchdog_up(dev);
1450 	}
1451 }
1452 EXPORT_SYMBOL(netif_device_attach);
1453 
1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455 {
1456 	return ((features & NETIF_F_GEN_CSUM) ||
1457 		((features & NETIF_F_IP_CSUM) &&
1458 		 protocol == htons(ETH_P_IP)) ||
1459 		((features & NETIF_F_IPV6_CSUM) &&
1460 		 protocol == htons(ETH_P_IPV6)));
1461 }
1462 
1463 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1464 {
1465 	if (can_checksum_protocol(dev->features, skb->protocol))
1466 		return true;
1467 
1468 	if (skb->protocol == htons(ETH_P_8021Q)) {
1469 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1470 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1471 					  veh->h_vlan_encapsulated_proto))
1472 			return true;
1473 	}
1474 
1475 	return false;
1476 }
1477 
1478 /*
1479  * Invalidate hardware checksum when packet is to be mangled, and
1480  * complete checksum manually on outgoing path.
1481  */
1482 int skb_checksum_help(struct sk_buff *skb)
1483 {
1484 	__wsum csum;
1485 	int ret = 0, offset;
1486 
1487 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1488 		goto out_set_summed;
1489 
1490 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1491 		/* Let GSO fix up the checksum. */
1492 		goto out_set_summed;
1493 	}
1494 
1495 	offset = skb->csum_start - skb_headroom(skb);
1496 	BUG_ON(offset >= skb_headlen(skb));
1497 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1498 
1499 	offset += skb->csum_offset;
1500 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1501 
1502 	if (skb_cloned(skb) &&
1503 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1504 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1505 		if (ret)
1506 			goto out;
1507 	}
1508 
1509 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1510 out_set_summed:
1511 	skb->ip_summed = CHECKSUM_NONE;
1512 out:
1513 	return ret;
1514 }
1515 
1516 /**
1517  *	skb_gso_segment - Perform segmentation on skb.
1518  *	@skb: buffer to segment
1519  *	@features: features for the output path (see dev->features)
1520  *
1521  *	This function segments the given skb and returns a list of segments.
1522  *
1523  *	It may return NULL if the skb requires no segmentation.  This is
1524  *	only possible when GSO is used for verifying header integrity.
1525  */
1526 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1527 {
1528 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1529 	struct packet_type *ptype;
1530 	__be16 type = skb->protocol;
1531 	int err;
1532 
1533 	skb_reset_mac_header(skb);
1534 	skb->mac_len = skb->network_header - skb->mac_header;
1535 	__skb_pull(skb, skb->mac_len);
1536 
1537 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1538 		struct net_device *dev = skb->dev;
1539 		struct ethtool_drvinfo info = {};
1540 
1541 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1542 			dev->ethtool_ops->get_drvinfo(dev, &info);
1543 
1544 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1545 			"ip_summed=%d",
1546 		     info.driver, dev ? dev->features : 0L,
1547 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1548 		     skb->len, skb->data_len, skb->ip_summed);
1549 
1550 		if (skb_header_cloned(skb) &&
1551 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1552 			return ERR_PTR(err);
1553 	}
1554 
1555 	rcu_read_lock();
1556 	list_for_each_entry_rcu(ptype,
1557 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1558 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1559 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1560 				err = ptype->gso_send_check(skb);
1561 				segs = ERR_PTR(err);
1562 				if (err || skb_gso_ok(skb, features))
1563 					break;
1564 				__skb_push(skb, (skb->data -
1565 						 skb_network_header(skb)));
1566 			}
1567 			segs = ptype->gso_segment(skb, features);
1568 			break;
1569 		}
1570 	}
1571 	rcu_read_unlock();
1572 
1573 	__skb_push(skb, skb->data - skb_mac_header(skb));
1574 
1575 	return segs;
1576 }
1577 
1578 EXPORT_SYMBOL(skb_gso_segment);
1579 
1580 /* Take action when hardware reception checksum errors are detected. */
1581 #ifdef CONFIG_BUG
1582 void netdev_rx_csum_fault(struct net_device *dev)
1583 {
1584 	if (net_ratelimit()) {
1585 		printk(KERN_ERR "%s: hw csum failure.\n",
1586 			dev ? dev->name : "<unknown>");
1587 		dump_stack();
1588 	}
1589 }
1590 EXPORT_SYMBOL(netdev_rx_csum_fault);
1591 #endif
1592 
1593 /* Actually, we should eliminate this check as soon as we know, that:
1594  * 1. IOMMU is present and allows to map all the memory.
1595  * 2. No high memory really exists on this machine.
1596  */
1597 
1598 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1599 {
1600 #ifdef CONFIG_HIGHMEM
1601 	int i;
1602 
1603 	if (dev->features & NETIF_F_HIGHDMA)
1604 		return 0;
1605 
1606 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1607 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1608 			return 1;
1609 
1610 #endif
1611 	return 0;
1612 }
1613 
1614 struct dev_gso_cb {
1615 	void (*destructor)(struct sk_buff *skb);
1616 };
1617 
1618 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1619 
1620 static void dev_gso_skb_destructor(struct sk_buff *skb)
1621 {
1622 	struct dev_gso_cb *cb;
1623 
1624 	do {
1625 		struct sk_buff *nskb = skb->next;
1626 
1627 		skb->next = nskb->next;
1628 		nskb->next = NULL;
1629 		kfree_skb(nskb);
1630 	} while (skb->next);
1631 
1632 	cb = DEV_GSO_CB(skb);
1633 	if (cb->destructor)
1634 		cb->destructor(skb);
1635 }
1636 
1637 /**
1638  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1639  *	@skb: buffer to segment
1640  *
1641  *	This function segments the given skb and stores the list of segments
1642  *	in skb->next.
1643  */
1644 static int dev_gso_segment(struct sk_buff *skb)
1645 {
1646 	struct net_device *dev = skb->dev;
1647 	struct sk_buff *segs;
1648 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1649 					 NETIF_F_SG : 0);
1650 
1651 	segs = skb_gso_segment(skb, features);
1652 
1653 	/* Verifying header integrity only. */
1654 	if (!segs)
1655 		return 0;
1656 
1657 	if (IS_ERR(segs))
1658 		return PTR_ERR(segs);
1659 
1660 	skb->next = segs;
1661 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1662 	skb->destructor = dev_gso_skb_destructor;
1663 
1664 	return 0;
1665 }
1666 
1667 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1668 			struct netdev_queue *txq)
1669 {
1670 	const struct net_device_ops *ops = dev->netdev_ops;
1671 
1672 	prefetch(&dev->netdev_ops->ndo_start_xmit);
1673 	if (likely(!skb->next)) {
1674 		if (!list_empty(&ptype_all))
1675 			dev_queue_xmit_nit(skb, dev);
1676 
1677 		if (netif_needs_gso(dev, skb)) {
1678 			if (unlikely(dev_gso_segment(skb)))
1679 				goto out_kfree_skb;
1680 			if (skb->next)
1681 				goto gso;
1682 		}
1683 
1684 		return ops->ndo_start_xmit(skb, dev);
1685 	}
1686 
1687 gso:
1688 	do {
1689 		struct sk_buff *nskb = skb->next;
1690 		int rc;
1691 
1692 		skb->next = nskb->next;
1693 		nskb->next = NULL;
1694 		rc = ops->ndo_start_xmit(nskb, dev);
1695 		if (unlikely(rc)) {
1696 			nskb->next = skb->next;
1697 			skb->next = nskb;
1698 			return rc;
1699 		}
1700 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1701 			return NETDEV_TX_BUSY;
1702 	} while (skb->next);
1703 
1704 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1705 
1706 out_kfree_skb:
1707 	kfree_skb(skb);
1708 	return 0;
1709 }
1710 
1711 static u32 simple_tx_hashrnd;
1712 static int simple_tx_hashrnd_initialized = 0;
1713 
1714 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1715 {
1716 	u32 addr1, addr2, ports;
1717 	u32 hash, ihl;
1718 	u8 ip_proto = 0;
1719 
1720 	if (unlikely(!simple_tx_hashrnd_initialized)) {
1721 		get_random_bytes(&simple_tx_hashrnd, 4);
1722 		simple_tx_hashrnd_initialized = 1;
1723 	}
1724 
1725 	switch (skb->protocol) {
1726 	case htons(ETH_P_IP):
1727 		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1728 			ip_proto = ip_hdr(skb)->protocol;
1729 		addr1 = ip_hdr(skb)->saddr;
1730 		addr2 = ip_hdr(skb)->daddr;
1731 		ihl = ip_hdr(skb)->ihl;
1732 		break;
1733 	case htons(ETH_P_IPV6):
1734 		ip_proto = ipv6_hdr(skb)->nexthdr;
1735 		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1736 		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1737 		ihl = (40 >> 2);
1738 		break;
1739 	default:
1740 		return 0;
1741 	}
1742 
1743 
1744 	switch (ip_proto) {
1745 	case IPPROTO_TCP:
1746 	case IPPROTO_UDP:
1747 	case IPPROTO_DCCP:
1748 	case IPPROTO_ESP:
1749 	case IPPROTO_AH:
1750 	case IPPROTO_SCTP:
1751 	case IPPROTO_UDPLITE:
1752 		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1753 		break;
1754 
1755 	default:
1756 		ports = 0;
1757 		break;
1758 	}
1759 
1760 	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1761 
1762 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1763 }
1764 
1765 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1766 					struct sk_buff *skb)
1767 {
1768 	const struct net_device_ops *ops = dev->netdev_ops;
1769 	u16 queue_index = 0;
1770 
1771 	if (ops->ndo_select_queue)
1772 		queue_index = ops->ndo_select_queue(dev, skb);
1773 	else if (dev->real_num_tx_queues > 1)
1774 		queue_index = simple_tx_hash(dev, skb);
1775 
1776 	skb_set_queue_mapping(skb, queue_index);
1777 	return netdev_get_tx_queue(dev, queue_index);
1778 }
1779 
1780 /**
1781  *	dev_queue_xmit - transmit a buffer
1782  *	@skb: buffer to transmit
1783  *
1784  *	Queue a buffer for transmission to a network device. The caller must
1785  *	have set the device and priority and built the buffer before calling
1786  *	this function. The function can be called from an interrupt.
1787  *
1788  *	A negative errno code is returned on a failure. A success does not
1789  *	guarantee the frame will be transmitted as it may be dropped due
1790  *	to congestion or traffic shaping.
1791  *
1792  * -----------------------------------------------------------------------------------
1793  *      I notice this method can also return errors from the queue disciplines,
1794  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1795  *      be positive.
1796  *
1797  *      Regardless of the return value, the skb is consumed, so it is currently
1798  *      difficult to retry a send to this method.  (You can bump the ref count
1799  *      before sending to hold a reference for retry if you are careful.)
1800  *
1801  *      When calling this method, interrupts MUST be enabled.  This is because
1802  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1803  *          --BLG
1804  */
1805 int dev_queue_xmit(struct sk_buff *skb)
1806 {
1807 	struct net_device *dev = skb->dev;
1808 	struct netdev_queue *txq;
1809 	struct Qdisc *q;
1810 	int rc = -ENOMEM;
1811 
1812 	/* GSO will handle the following emulations directly. */
1813 	if (netif_needs_gso(dev, skb))
1814 		goto gso;
1815 
1816 	if (skb_shinfo(skb)->frag_list &&
1817 	    !(dev->features & NETIF_F_FRAGLIST) &&
1818 	    __skb_linearize(skb))
1819 		goto out_kfree_skb;
1820 
1821 	/* Fragmented skb is linearized if device does not support SG,
1822 	 * or if at least one of fragments is in highmem and device
1823 	 * does not support DMA from it.
1824 	 */
1825 	if (skb_shinfo(skb)->nr_frags &&
1826 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1827 	    __skb_linearize(skb))
1828 		goto out_kfree_skb;
1829 
1830 	/* If packet is not checksummed and device does not support
1831 	 * checksumming for this protocol, complete checksumming here.
1832 	 */
1833 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1834 		skb_set_transport_header(skb, skb->csum_start -
1835 					      skb_headroom(skb));
1836 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1837 			goto out_kfree_skb;
1838 	}
1839 
1840 gso:
1841 	/* Disable soft irqs for various locks below. Also
1842 	 * stops preemption for RCU.
1843 	 */
1844 	rcu_read_lock_bh();
1845 
1846 	txq = dev_pick_tx(dev, skb);
1847 	q = rcu_dereference(txq->qdisc);
1848 
1849 #ifdef CONFIG_NET_CLS_ACT
1850 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1851 #endif
1852 	if (q->enqueue) {
1853 		spinlock_t *root_lock = qdisc_lock(q);
1854 
1855 		spin_lock(root_lock);
1856 
1857 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1858 			kfree_skb(skb);
1859 			rc = NET_XMIT_DROP;
1860 		} else {
1861 			rc = qdisc_enqueue_root(skb, q);
1862 			qdisc_run(q);
1863 		}
1864 		spin_unlock(root_lock);
1865 
1866 		goto out;
1867 	}
1868 
1869 	/* The device has no queue. Common case for software devices:
1870 	   loopback, all the sorts of tunnels...
1871 
1872 	   Really, it is unlikely that netif_tx_lock protection is necessary
1873 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1874 	   counters.)
1875 	   However, it is possible, that they rely on protection
1876 	   made by us here.
1877 
1878 	   Check this and shot the lock. It is not prone from deadlocks.
1879 	   Either shot noqueue qdisc, it is even simpler 8)
1880 	 */
1881 	if (dev->flags & IFF_UP) {
1882 		int cpu = smp_processor_id(); /* ok because BHs are off */
1883 
1884 		if (txq->xmit_lock_owner != cpu) {
1885 
1886 			HARD_TX_LOCK(dev, txq, cpu);
1887 
1888 			if (!netif_tx_queue_stopped(txq)) {
1889 				rc = 0;
1890 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1891 					HARD_TX_UNLOCK(dev, txq);
1892 					goto out;
1893 				}
1894 			}
1895 			HARD_TX_UNLOCK(dev, txq);
1896 			if (net_ratelimit())
1897 				printk(KERN_CRIT "Virtual device %s asks to "
1898 				       "queue packet!\n", dev->name);
1899 		} else {
1900 			/* Recursion is detected! It is possible,
1901 			 * unfortunately */
1902 			if (net_ratelimit())
1903 				printk(KERN_CRIT "Dead loop on virtual device "
1904 				       "%s, fix it urgently!\n", dev->name);
1905 		}
1906 	}
1907 
1908 	rc = -ENETDOWN;
1909 	rcu_read_unlock_bh();
1910 
1911 out_kfree_skb:
1912 	kfree_skb(skb);
1913 	return rc;
1914 out:
1915 	rcu_read_unlock_bh();
1916 	return rc;
1917 }
1918 
1919 
1920 /*=======================================================================
1921 			Receiver routines
1922   =======================================================================*/
1923 
1924 int netdev_max_backlog __read_mostly = 1000;
1925 int netdev_budget __read_mostly = 300;
1926 int weight_p __read_mostly = 64;            /* old backlog weight */
1927 
1928 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1929 
1930 
1931 /**
1932  *	netif_rx	-	post buffer to the network code
1933  *	@skb: buffer to post
1934  *
1935  *	This function receives a packet from a device driver and queues it for
1936  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1937  *	may be dropped during processing for congestion control or by the
1938  *	protocol layers.
1939  *
1940  *	return values:
1941  *	NET_RX_SUCCESS	(no congestion)
1942  *	NET_RX_DROP     (packet was dropped)
1943  *
1944  */
1945 
1946 int netif_rx(struct sk_buff *skb)
1947 {
1948 	struct softnet_data *queue;
1949 	unsigned long flags;
1950 
1951 	/* if netpoll wants it, pretend we never saw it */
1952 	if (netpoll_rx(skb))
1953 		return NET_RX_DROP;
1954 
1955 	if (!skb->tstamp.tv64)
1956 		net_timestamp(skb);
1957 
1958 	/*
1959 	 * The code is rearranged so that the path is the most
1960 	 * short when CPU is congested, but is still operating.
1961 	 */
1962 	local_irq_save(flags);
1963 	queue = &__get_cpu_var(softnet_data);
1964 
1965 	__get_cpu_var(netdev_rx_stat).total++;
1966 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1967 		if (queue->input_pkt_queue.qlen) {
1968 enqueue:
1969 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1970 			local_irq_restore(flags);
1971 			return NET_RX_SUCCESS;
1972 		}
1973 
1974 		napi_schedule(&queue->backlog);
1975 		goto enqueue;
1976 	}
1977 
1978 	__get_cpu_var(netdev_rx_stat).dropped++;
1979 	local_irq_restore(flags);
1980 
1981 	kfree_skb(skb);
1982 	return NET_RX_DROP;
1983 }
1984 
1985 int netif_rx_ni(struct sk_buff *skb)
1986 {
1987 	int err;
1988 
1989 	preempt_disable();
1990 	err = netif_rx(skb);
1991 	if (local_softirq_pending())
1992 		do_softirq();
1993 	preempt_enable();
1994 
1995 	return err;
1996 }
1997 
1998 EXPORT_SYMBOL(netif_rx_ni);
1999 
2000 static void net_tx_action(struct softirq_action *h)
2001 {
2002 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2003 
2004 	if (sd->completion_queue) {
2005 		struct sk_buff *clist;
2006 
2007 		local_irq_disable();
2008 		clist = sd->completion_queue;
2009 		sd->completion_queue = NULL;
2010 		local_irq_enable();
2011 
2012 		while (clist) {
2013 			struct sk_buff *skb = clist;
2014 			clist = clist->next;
2015 
2016 			WARN_ON(atomic_read(&skb->users));
2017 			__kfree_skb(skb);
2018 		}
2019 	}
2020 
2021 	if (sd->output_queue) {
2022 		struct Qdisc *head;
2023 
2024 		local_irq_disable();
2025 		head = sd->output_queue;
2026 		sd->output_queue = NULL;
2027 		local_irq_enable();
2028 
2029 		while (head) {
2030 			struct Qdisc *q = head;
2031 			spinlock_t *root_lock;
2032 
2033 			head = head->next_sched;
2034 
2035 			root_lock = qdisc_lock(q);
2036 			if (spin_trylock(root_lock)) {
2037 				smp_mb__before_clear_bit();
2038 				clear_bit(__QDISC_STATE_SCHED,
2039 					  &q->state);
2040 				qdisc_run(q);
2041 				spin_unlock(root_lock);
2042 			} else {
2043 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2044 					      &q->state)) {
2045 					__netif_reschedule(q);
2046 				} else {
2047 					smp_mb__before_clear_bit();
2048 					clear_bit(__QDISC_STATE_SCHED,
2049 						  &q->state);
2050 				}
2051 			}
2052 		}
2053 	}
2054 }
2055 
2056 static inline int deliver_skb(struct sk_buff *skb,
2057 			      struct packet_type *pt_prev,
2058 			      struct net_device *orig_dev)
2059 {
2060 	atomic_inc(&skb->users);
2061 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2062 }
2063 
2064 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2065 /* These hooks defined here for ATM */
2066 struct net_bridge;
2067 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2068 						unsigned char *addr);
2069 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2070 
2071 /*
2072  * If bridge module is loaded call bridging hook.
2073  *  returns NULL if packet was consumed.
2074  */
2075 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2076 					struct sk_buff *skb) __read_mostly;
2077 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2078 					    struct packet_type **pt_prev, int *ret,
2079 					    struct net_device *orig_dev)
2080 {
2081 	struct net_bridge_port *port;
2082 
2083 	if (skb->pkt_type == PACKET_LOOPBACK ||
2084 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2085 		return skb;
2086 
2087 	if (*pt_prev) {
2088 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2089 		*pt_prev = NULL;
2090 	}
2091 
2092 	return br_handle_frame_hook(port, skb);
2093 }
2094 #else
2095 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2096 #endif
2097 
2098 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2099 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2100 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2101 
2102 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2103 					     struct packet_type **pt_prev,
2104 					     int *ret,
2105 					     struct net_device *orig_dev)
2106 {
2107 	if (skb->dev->macvlan_port == NULL)
2108 		return skb;
2109 
2110 	if (*pt_prev) {
2111 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2112 		*pt_prev = NULL;
2113 	}
2114 	return macvlan_handle_frame_hook(skb);
2115 }
2116 #else
2117 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2118 #endif
2119 
2120 #ifdef CONFIG_NET_CLS_ACT
2121 /* TODO: Maybe we should just force sch_ingress to be compiled in
2122  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2123  * a compare and 2 stores extra right now if we dont have it on
2124  * but have CONFIG_NET_CLS_ACT
2125  * NOTE: This doesnt stop any functionality; if you dont have
2126  * the ingress scheduler, you just cant add policies on ingress.
2127  *
2128  */
2129 static int ing_filter(struct sk_buff *skb)
2130 {
2131 	struct net_device *dev = skb->dev;
2132 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2133 	struct netdev_queue *rxq;
2134 	int result = TC_ACT_OK;
2135 	struct Qdisc *q;
2136 
2137 	if (MAX_RED_LOOP < ttl++) {
2138 		printk(KERN_WARNING
2139 		       "Redir loop detected Dropping packet (%d->%d)\n",
2140 		       skb->iif, dev->ifindex);
2141 		return TC_ACT_SHOT;
2142 	}
2143 
2144 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2145 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2146 
2147 	rxq = &dev->rx_queue;
2148 
2149 	q = rxq->qdisc;
2150 	if (q != &noop_qdisc) {
2151 		spin_lock(qdisc_lock(q));
2152 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2153 			result = qdisc_enqueue_root(skb, q);
2154 		spin_unlock(qdisc_lock(q));
2155 	}
2156 
2157 	return result;
2158 }
2159 
2160 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2161 					 struct packet_type **pt_prev,
2162 					 int *ret, struct net_device *orig_dev)
2163 {
2164 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2165 		goto out;
2166 
2167 	if (*pt_prev) {
2168 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2169 		*pt_prev = NULL;
2170 	} else {
2171 		/* Huh? Why does turning on AF_PACKET affect this? */
2172 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2173 	}
2174 
2175 	switch (ing_filter(skb)) {
2176 	case TC_ACT_SHOT:
2177 	case TC_ACT_STOLEN:
2178 		kfree_skb(skb);
2179 		return NULL;
2180 	}
2181 
2182 out:
2183 	skb->tc_verd = 0;
2184 	return skb;
2185 }
2186 #endif
2187 
2188 /*
2189  * 	netif_nit_deliver - deliver received packets to network taps
2190  * 	@skb: buffer
2191  *
2192  * 	This function is used to deliver incoming packets to network
2193  * 	taps. It should be used when the normal netif_receive_skb path
2194  * 	is bypassed, for example because of VLAN acceleration.
2195  */
2196 void netif_nit_deliver(struct sk_buff *skb)
2197 {
2198 	struct packet_type *ptype;
2199 
2200 	if (list_empty(&ptype_all))
2201 		return;
2202 
2203 	skb_reset_network_header(skb);
2204 	skb_reset_transport_header(skb);
2205 	skb->mac_len = skb->network_header - skb->mac_header;
2206 
2207 	rcu_read_lock();
2208 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2209 		if (!ptype->dev || ptype->dev == skb->dev)
2210 			deliver_skb(skb, ptype, skb->dev);
2211 	}
2212 	rcu_read_unlock();
2213 }
2214 
2215 /**
2216  *	netif_receive_skb - process receive buffer from network
2217  *	@skb: buffer to process
2218  *
2219  *	netif_receive_skb() is the main receive data processing function.
2220  *	It always succeeds. The buffer may be dropped during processing
2221  *	for congestion control or by the protocol layers.
2222  *
2223  *	This function may only be called from softirq context and interrupts
2224  *	should be enabled.
2225  *
2226  *	Return values (usually ignored):
2227  *	NET_RX_SUCCESS: no congestion
2228  *	NET_RX_DROP: packet was dropped
2229  */
2230 int netif_receive_skb(struct sk_buff *skb)
2231 {
2232 	struct packet_type *ptype, *pt_prev;
2233 	struct net_device *orig_dev;
2234 	struct net_device *null_or_orig;
2235 	int ret = NET_RX_DROP;
2236 	__be16 type;
2237 
2238 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2239 		return NET_RX_SUCCESS;
2240 
2241 	/* if we've gotten here through NAPI, check netpoll */
2242 	if (netpoll_receive_skb(skb))
2243 		return NET_RX_DROP;
2244 
2245 	if (!skb->tstamp.tv64)
2246 		net_timestamp(skb);
2247 
2248 	if (!skb->iif)
2249 		skb->iif = skb->dev->ifindex;
2250 
2251 	null_or_orig = NULL;
2252 	orig_dev = skb->dev;
2253 	if (orig_dev->master) {
2254 		if (skb_bond_should_drop(skb))
2255 			null_or_orig = orig_dev; /* deliver only exact match */
2256 		else
2257 			skb->dev = orig_dev->master;
2258 	}
2259 
2260 	__get_cpu_var(netdev_rx_stat).total++;
2261 
2262 	skb_reset_network_header(skb);
2263 	skb_reset_transport_header(skb);
2264 	skb->mac_len = skb->network_header - skb->mac_header;
2265 
2266 	pt_prev = NULL;
2267 
2268 	rcu_read_lock();
2269 
2270 #ifdef CONFIG_NET_CLS_ACT
2271 	if (skb->tc_verd & TC_NCLS) {
2272 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2273 		goto ncls;
2274 	}
2275 #endif
2276 
2277 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2278 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2279 		    ptype->dev == orig_dev) {
2280 			if (pt_prev)
2281 				ret = deliver_skb(skb, pt_prev, orig_dev);
2282 			pt_prev = ptype;
2283 		}
2284 	}
2285 
2286 #ifdef CONFIG_NET_CLS_ACT
2287 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2288 	if (!skb)
2289 		goto out;
2290 ncls:
2291 #endif
2292 
2293 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2294 	if (!skb)
2295 		goto out;
2296 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2297 	if (!skb)
2298 		goto out;
2299 
2300 	type = skb->protocol;
2301 	list_for_each_entry_rcu(ptype,
2302 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2303 		if (ptype->type == type &&
2304 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2305 		     ptype->dev == orig_dev)) {
2306 			if (pt_prev)
2307 				ret = deliver_skb(skb, pt_prev, orig_dev);
2308 			pt_prev = ptype;
2309 		}
2310 	}
2311 
2312 	if (pt_prev) {
2313 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2314 	} else {
2315 		kfree_skb(skb);
2316 		/* Jamal, now you will not able to escape explaining
2317 		 * me how you were going to use this. :-)
2318 		 */
2319 		ret = NET_RX_DROP;
2320 	}
2321 
2322 out:
2323 	rcu_read_unlock();
2324 	return ret;
2325 }
2326 
2327 /* Network device is going away, flush any packets still pending  */
2328 static void flush_backlog(void *arg)
2329 {
2330 	struct net_device *dev = arg;
2331 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2332 	struct sk_buff *skb, *tmp;
2333 
2334 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2335 		if (skb->dev == dev) {
2336 			__skb_unlink(skb, &queue->input_pkt_queue);
2337 			kfree_skb(skb);
2338 		}
2339 }
2340 
2341 static int napi_gro_complete(struct sk_buff *skb)
2342 {
2343 	struct packet_type *ptype;
2344 	__be16 type = skb->protocol;
2345 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2346 	int err = -ENOENT;
2347 
2348 	if (NAPI_GRO_CB(skb)->count == 1)
2349 		goto out;
2350 
2351 	rcu_read_lock();
2352 	list_for_each_entry_rcu(ptype, head, list) {
2353 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2354 			continue;
2355 
2356 		err = ptype->gro_complete(skb);
2357 		break;
2358 	}
2359 	rcu_read_unlock();
2360 
2361 	if (err) {
2362 		WARN_ON(&ptype->list == head);
2363 		kfree_skb(skb);
2364 		return NET_RX_SUCCESS;
2365 	}
2366 
2367 out:
2368 	skb_shinfo(skb)->gso_size = 0;
2369 	__skb_push(skb, -skb_network_offset(skb));
2370 	return netif_receive_skb(skb);
2371 }
2372 
2373 void napi_gro_flush(struct napi_struct *napi)
2374 {
2375 	struct sk_buff *skb, *next;
2376 
2377 	for (skb = napi->gro_list; skb; skb = next) {
2378 		next = skb->next;
2379 		skb->next = NULL;
2380 		napi_gro_complete(skb);
2381 	}
2382 
2383 	napi->gro_list = NULL;
2384 }
2385 EXPORT_SYMBOL(napi_gro_flush);
2386 
2387 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2388 {
2389 	struct sk_buff **pp = NULL;
2390 	struct packet_type *ptype;
2391 	__be16 type = skb->protocol;
2392 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2393 	int count = 0;
2394 	int same_flow;
2395 	int mac_len;
2396 	int free;
2397 
2398 	if (!(skb->dev->features & NETIF_F_GRO))
2399 		goto normal;
2400 
2401 	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2402 		goto normal;
2403 
2404 	rcu_read_lock();
2405 	list_for_each_entry_rcu(ptype, head, list) {
2406 		struct sk_buff *p;
2407 
2408 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2409 			continue;
2410 
2411 		skb_reset_network_header(skb);
2412 		mac_len = skb->network_header - skb->mac_header;
2413 		skb->mac_len = mac_len;
2414 		NAPI_GRO_CB(skb)->same_flow = 0;
2415 		NAPI_GRO_CB(skb)->flush = 0;
2416 		NAPI_GRO_CB(skb)->free = 0;
2417 
2418 		for (p = napi->gro_list; p; p = p->next) {
2419 			count++;
2420 
2421 			if (!NAPI_GRO_CB(p)->same_flow)
2422 				continue;
2423 
2424 			if (p->mac_len != mac_len ||
2425 			    memcmp(skb_mac_header(p), skb_mac_header(skb),
2426 				   mac_len))
2427 				NAPI_GRO_CB(p)->same_flow = 0;
2428 		}
2429 
2430 		pp = ptype->gro_receive(&napi->gro_list, skb);
2431 		break;
2432 	}
2433 	rcu_read_unlock();
2434 
2435 	if (&ptype->list == head)
2436 		goto normal;
2437 
2438 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2439 	free = NAPI_GRO_CB(skb)->free;
2440 
2441 	if (pp) {
2442 		struct sk_buff *nskb = *pp;
2443 
2444 		*pp = nskb->next;
2445 		nskb->next = NULL;
2446 		napi_gro_complete(nskb);
2447 		count--;
2448 	}
2449 
2450 	if (same_flow)
2451 		goto ok;
2452 
2453 	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
2454 		__skb_push(skb, -skb_network_offset(skb));
2455 		goto normal;
2456 	}
2457 
2458 	NAPI_GRO_CB(skb)->count = 1;
2459 	skb_shinfo(skb)->gso_size = skb->len;
2460 	skb->next = napi->gro_list;
2461 	napi->gro_list = skb;
2462 
2463 ok:
2464 	return free;
2465 
2466 normal:
2467 	return -1;
2468 }
2469 EXPORT_SYMBOL(dev_gro_receive);
2470 
2471 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2472 {
2473 	struct sk_buff *p;
2474 
2475 	for (p = napi->gro_list; p; p = p->next) {
2476 		NAPI_GRO_CB(p)->same_flow = 1;
2477 		NAPI_GRO_CB(p)->flush = 0;
2478 	}
2479 
2480 	return dev_gro_receive(napi, skb);
2481 }
2482 
2483 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2484 {
2485 	if (netpoll_receive_skb(skb))
2486 		return NET_RX_DROP;
2487 
2488 	switch (__napi_gro_receive(napi, skb)) {
2489 	case -1:
2490 		return netif_receive_skb(skb);
2491 
2492 	case 1:
2493 		kfree_skb(skb);
2494 		break;
2495 	}
2496 
2497 	return NET_RX_SUCCESS;
2498 }
2499 EXPORT_SYMBOL(napi_gro_receive);
2500 
2501 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2502 {
2503 	__skb_pull(skb, skb_headlen(skb));
2504 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2505 
2506 	napi->skb = skb;
2507 }
2508 EXPORT_SYMBOL(napi_reuse_skb);
2509 
2510 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2511 				  struct napi_gro_fraginfo *info)
2512 {
2513 	struct net_device *dev = napi->dev;
2514 	struct sk_buff *skb = napi->skb;
2515 
2516 	napi->skb = NULL;
2517 
2518 	if (!skb) {
2519 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2520 		if (!skb)
2521 			goto out;
2522 
2523 		skb_reserve(skb, NET_IP_ALIGN);
2524 	}
2525 
2526 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2527 	skb_shinfo(skb)->nr_frags = info->nr_frags;
2528 	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2529 
2530 	skb->data_len = info->len;
2531 	skb->len += info->len;
2532 	skb->truesize += info->len;
2533 
2534 	if (!pskb_may_pull(skb, ETH_HLEN)) {
2535 		napi_reuse_skb(napi, skb);
2536 		skb = NULL;
2537 		goto out;
2538 	}
2539 
2540 	skb->protocol = eth_type_trans(skb, dev);
2541 
2542 	skb->ip_summed = info->ip_summed;
2543 	skb->csum = info->csum;
2544 
2545 out:
2546 	return skb;
2547 }
2548 EXPORT_SYMBOL(napi_fraginfo_skb);
2549 
2550 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2551 {
2552 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2553 	int err = NET_RX_DROP;
2554 
2555 	if (!skb)
2556 		goto out;
2557 
2558 	if (netpoll_receive_skb(skb))
2559 		goto out;
2560 
2561 	err = NET_RX_SUCCESS;
2562 
2563 	switch (__napi_gro_receive(napi, skb)) {
2564 	case -1:
2565 		return netif_receive_skb(skb);
2566 
2567 	case 0:
2568 		goto out;
2569 	}
2570 
2571 	napi_reuse_skb(napi, skb);
2572 
2573 out:
2574 	return err;
2575 }
2576 EXPORT_SYMBOL(napi_gro_frags);
2577 
2578 static int process_backlog(struct napi_struct *napi, int quota)
2579 {
2580 	int work = 0;
2581 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2582 	unsigned long start_time = jiffies;
2583 
2584 	napi->weight = weight_p;
2585 	do {
2586 		struct sk_buff *skb;
2587 
2588 		local_irq_disable();
2589 		skb = __skb_dequeue(&queue->input_pkt_queue);
2590 		if (!skb) {
2591 			__napi_complete(napi);
2592 			local_irq_enable();
2593 			break;
2594 		}
2595 		local_irq_enable();
2596 
2597 		napi_gro_receive(napi, skb);
2598 	} while (++work < quota && jiffies == start_time);
2599 
2600 	napi_gro_flush(napi);
2601 
2602 	return work;
2603 }
2604 
2605 /**
2606  * __napi_schedule - schedule for receive
2607  * @n: entry to schedule
2608  *
2609  * The entry's receive function will be scheduled to run
2610  */
2611 void __napi_schedule(struct napi_struct *n)
2612 {
2613 	unsigned long flags;
2614 
2615 	local_irq_save(flags);
2616 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2617 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2618 	local_irq_restore(flags);
2619 }
2620 EXPORT_SYMBOL(__napi_schedule);
2621 
2622 void __napi_complete(struct napi_struct *n)
2623 {
2624 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2625 	BUG_ON(n->gro_list);
2626 
2627 	list_del(&n->poll_list);
2628 	smp_mb__before_clear_bit();
2629 	clear_bit(NAPI_STATE_SCHED, &n->state);
2630 }
2631 EXPORT_SYMBOL(__napi_complete);
2632 
2633 void napi_complete(struct napi_struct *n)
2634 {
2635 	unsigned long flags;
2636 
2637 	/*
2638 	 * don't let napi dequeue from the cpu poll list
2639 	 * just in case its running on a different cpu
2640 	 */
2641 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2642 		return;
2643 
2644 	napi_gro_flush(n);
2645 	local_irq_save(flags);
2646 	__napi_complete(n);
2647 	local_irq_restore(flags);
2648 }
2649 EXPORT_SYMBOL(napi_complete);
2650 
2651 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2652 		    int (*poll)(struct napi_struct *, int), int weight)
2653 {
2654 	INIT_LIST_HEAD(&napi->poll_list);
2655 	napi->gro_list = NULL;
2656 	napi->skb = NULL;
2657 	napi->poll = poll;
2658 	napi->weight = weight;
2659 	list_add(&napi->dev_list, &dev->napi_list);
2660 	napi->dev = dev;
2661 #ifdef CONFIG_NETPOLL
2662 	spin_lock_init(&napi->poll_lock);
2663 	napi->poll_owner = -1;
2664 #endif
2665 	set_bit(NAPI_STATE_SCHED, &napi->state);
2666 }
2667 EXPORT_SYMBOL(netif_napi_add);
2668 
2669 void netif_napi_del(struct napi_struct *napi)
2670 {
2671 	struct sk_buff *skb, *next;
2672 
2673 	list_del_init(&napi->dev_list);
2674 	kfree(napi->skb);
2675 
2676 	for (skb = napi->gro_list; skb; skb = next) {
2677 		next = skb->next;
2678 		skb->next = NULL;
2679 		kfree_skb(skb);
2680 	}
2681 
2682 	napi->gro_list = NULL;
2683 }
2684 EXPORT_SYMBOL(netif_napi_del);
2685 
2686 
2687 static void net_rx_action(struct softirq_action *h)
2688 {
2689 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2690 	unsigned long time_limit = jiffies + 2;
2691 	int budget = netdev_budget;
2692 	void *have;
2693 
2694 	local_irq_disable();
2695 
2696 	while (!list_empty(list)) {
2697 		struct napi_struct *n;
2698 		int work, weight;
2699 
2700 		/* If softirq window is exhuasted then punt.
2701 		 * Allow this to run for 2 jiffies since which will allow
2702 		 * an average latency of 1.5/HZ.
2703 		 */
2704 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2705 			goto softnet_break;
2706 
2707 		local_irq_enable();
2708 
2709 		/* Even though interrupts have been re-enabled, this
2710 		 * access is safe because interrupts can only add new
2711 		 * entries to the tail of this list, and only ->poll()
2712 		 * calls can remove this head entry from the list.
2713 		 */
2714 		n = list_entry(list->next, struct napi_struct, poll_list);
2715 
2716 		have = netpoll_poll_lock(n);
2717 
2718 		weight = n->weight;
2719 
2720 		/* This NAPI_STATE_SCHED test is for avoiding a race
2721 		 * with netpoll's poll_napi().  Only the entity which
2722 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2723 		 * actually make the ->poll() call.  Therefore we avoid
2724 		 * accidently calling ->poll() when NAPI is not scheduled.
2725 		 */
2726 		work = 0;
2727 		if (test_bit(NAPI_STATE_SCHED, &n->state))
2728 			work = n->poll(n, weight);
2729 
2730 		WARN_ON_ONCE(work > weight);
2731 
2732 		budget -= work;
2733 
2734 		local_irq_disable();
2735 
2736 		/* Drivers must not modify the NAPI state if they
2737 		 * consume the entire weight.  In such cases this code
2738 		 * still "owns" the NAPI instance and therefore can
2739 		 * move the instance around on the list at-will.
2740 		 */
2741 		if (unlikely(work == weight)) {
2742 			if (unlikely(napi_disable_pending(n)))
2743 				__napi_complete(n);
2744 			else
2745 				list_move_tail(&n->poll_list, list);
2746 		}
2747 
2748 		netpoll_poll_unlock(have);
2749 	}
2750 out:
2751 	local_irq_enable();
2752 
2753 #ifdef CONFIG_NET_DMA
2754 	/*
2755 	 * There may not be any more sk_buffs coming right now, so push
2756 	 * any pending DMA copies to hardware
2757 	 */
2758 	dma_issue_pending_all();
2759 #endif
2760 
2761 	return;
2762 
2763 softnet_break:
2764 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2765 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2766 	goto out;
2767 }
2768 
2769 static gifconf_func_t * gifconf_list [NPROTO];
2770 
2771 /**
2772  *	register_gifconf	-	register a SIOCGIF handler
2773  *	@family: Address family
2774  *	@gifconf: Function handler
2775  *
2776  *	Register protocol dependent address dumping routines. The handler
2777  *	that is passed must not be freed or reused until it has been replaced
2778  *	by another handler.
2779  */
2780 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2781 {
2782 	if (family >= NPROTO)
2783 		return -EINVAL;
2784 	gifconf_list[family] = gifconf;
2785 	return 0;
2786 }
2787 
2788 
2789 /*
2790  *	Map an interface index to its name (SIOCGIFNAME)
2791  */
2792 
2793 /*
2794  *	We need this ioctl for efficient implementation of the
2795  *	if_indextoname() function required by the IPv6 API.  Without
2796  *	it, we would have to search all the interfaces to find a
2797  *	match.  --pb
2798  */
2799 
2800 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2801 {
2802 	struct net_device *dev;
2803 	struct ifreq ifr;
2804 
2805 	/*
2806 	 *	Fetch the caller's info block.
2807 	 */
2808 
2809 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2810 		return -EFAULT;
2811 
2812 	read_lock(&dev_base_lock);
2813 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2814 	if (!dev) {
2815 		read_unlock(&dev_base_lock);
2816 		return -ENODEV;
2817 	}
2818 
2819 	strcpy(ifr.ifr_name, dev->name);
2820 	read_unlock(&dev_base_lock);
2821 
2822 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2823 		return -EFAULT;
2824 	return 0;
2825 }
2826 
2827 /*
2828  *	Perform a SIOCGIFCONF call. This structure will change
2829  *	size eventually, and there is nothing I can do about it.
2830  *	Thus we will need a 'compatibility mode'.
2831  */
2832 
2833 static int dev_ifconf(struct net *net, char __user *arg)
2834 {
2835 	struct ifconf ifc;
2836 	struct net_device *dev;
2837 	char __user *pos;
2838 	int len;
2839 	int total;
2840 	int i;
2841 
2842 	/*
2843 	 *	Fetch the caller's info block.
2844 	 */
2845 
2846 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2847 		return -EFAULT;
2848 
2849 	pos = ifc.ifc_buf;
2850 	len = ifc.ifc_len;
2851 
2852 	/*
2853 	 *	Loop over the interfaces, and write an info block for each.
2854 	 */
2855 
2856 	total = 0;
2857 	for_each_netdev(net, dev) {
2858 		for (i = 0; i < NPROTO; i++) {
2859 			if (gifconf_list[i]) {
2860 				int done;
2861 				if (!pos)
2862 					done = gifconf_list[i](dev, NULL, 0);
2863 				else
2864 					done = gifconf_list[i](dev, pos + total,
2865 							       len - total);
2866 				if (done < 0)
2867 					return -EFAULT;
2868 				total += done;
2869 			}
2870 		}
2871 	}
2872 
2873 	/*
2874 	 *	All done.  Write the updated control block back to the caller.
2875 	 */
2876 	ifc.ifc_len = total;
2877 
2878 	/*
2879 	 * 	Both BSD and Solaris return 0 here, so we do too.
2880 	 */
2881 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2882 }
2883 
2884 #ifdef CONFIG_PROC_FS
2885 /*
2886  *	This is invoked by the /proc filesystem handler to display a device
2887  *	in detail.
2888  */
2889 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2890 	__acquires(dev_base_lock)
2891 {
2892 	struct net *net = seq_file_net(seq);
2893 	loff_t off;
2894 	struct net_device *dev;
2895 
2896 	read_lock(&dev_base_lock);
2897 	if (!*pos)
2898 		return SEQ_START_TOKEN;
2899 
2900 	off = 1;
2901 	for_each_netdev(net, dev)
2902 		if (off++ == *pos)
2903 			return dev;
2904 
2905 	return NULL;
2906 }
2907 
2908 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2909 {
2910 	struct net *net = seq_file_net(seq);
2911 	++*pos;
2912 	return v == SEQ_START_TOKEN ?
2913 		first_net_device(net) : next_net_device((struct net_device *)v);
2914 }
2915 
2916 void dev_seq_stop(struct seq_file *seq, void *v)
2917 	__releases(dev_base_lock)
2918 {
2919 	read_unlock(&dev_base_lock);
2920 }
2921 
2922 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2923 {
2924 	const struct net_device_stats *stats = dev_get_stats(dev);
2925 
2926 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2927 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2928 		   dev->name, stats->rx_bytes, stats->rx_packets,
2929 		   stats->rx_errors,
2930 		   stats->rx_dropped + stats->rx_missed_errors,
2931 		   stats->rx_fifo_errors,
2932 		   stats->rx_length_errors + stats->rx_over_errors +
2933 		    stats->rx_crc_errors + stats->rx_frame_errors,
2934 		   stats->rx_compressed, stats->multicast,
2935 		   stats->tx_bytes, stats->tx_packets,
2936 		   stats->tx_errors, stats->tx_dropped,
2937 		   stats->tx_fifo_errors, stats->collisions,
2938 		   stats->tx_carrier_errors +
2939 		    stats->tx_aborted_errors +
2940 		    stats->tx_window_errors +
2941 		    stats->tx_heartbeat_errors,
2942 		   stats->tx_compressed);
2943 }
2944 
2945 /*
2946  *	Called from the PROCfs module. This now uses the new arbitrary sized
2947  *	/proc/net interface to create /proc/net/dev
2948  */
2949 static int dev_seq_show(struct seq_file *seq, void *v)
2950 {
2951 	if (v == SEQ_START_TOKEN)
2952 		seq_puts(seq, "Inter-|   Receive                            "
2953 			      "                    |  Transmit\n"
2954 			      " face |bytes    packets errs drop fifo frame "
2955 			      "compressed multicast|bytes    packets errs "
2956 			      "drop fifo colls carrier compressed\n");
2957 	else
2958 		dev_seq_printf_stats(seq, v);
2959 	return 0;
2960 }
2961 
2962 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2963 {
2964 	struct netif_rx_stats *rc = NULL;
2965 
2966 	while (*pos < nr_cpu_ids)
2967 		if (cpu_online(*pos)) {
2968 			rc = &per_cpu(netdev_rx_stat, *pos);
2969 			break;
2970 		} else
2971 			++*pos;
2972 	return rc;
2973 }
2974 
2975 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2976 {
2977 	return softnet_get_online(pos);
2978 }
2979 
2980 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2981 {
2982 	++*pos;
2983 	return softnet_get_online(pos);
2984 }
2985 
2986 static void softnet_seq_stop(struct seq_file *seq, void *v)
2987 {
2988 }
2989 
2990 static int softnet_seq_show(struct seq_file *seq, void *v)
2991 {
2992 	struct netif_rx_stats *s = v;
2993 
2994 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2995 		   s->total, s->dropped, s->time_squeeze, 0,
2996 		   0, 0, 0, 0, /* was fastroute */
2997 		   s->cpu_collision );
2998 	return 0;
2999 }
3000 
3001 static const struct seq_operations dev_seq_ops = {
3002 	.start = dev_seq_start,
3003 	.next  = dev_seq_next,
3004 	.stop  = dev_seq_stop,
3005 	.show  = dev_seq_show,
3006 };
3007 
3008 static int dev_seq_open(struct inode *inode, struct file *file)
3009 {
3010 	return seq_open_net(inode, file, &dev_seq_ops,
3011 			    sizeof(struct seq_net_private));
3012 }
3013 
3014 static const struct file_operations dev_seq_fops = {
3015 	.owner	 = THIS_MODULE,
3016 	.open    = dev_seq_open,
3017 	.read    = seq_read,
3018 	.llseek  = seq_lseek,
3019 	.release = seq_release_net,
3020 };
3021 
3022 static const struct seq_operations softnet_seq_ops = {
3023 	.start = softnet_seq_start,
3024 	.next  = softnet_seq_next,
3025 	.stop  = softnet_seq_stop,
3026 	.show  = softnet_seq_show,
3027 };
3028 
3029 static int softnet_seq_open(struct inode *inode, struct file *file)
3030 {
3031 	return seq_open(file, &softnet_seq_ops);
3032 }
3033 
3034 static const struct file_operations softnet_seq_fops = {
3035 	.owner	 = THIS_MODULE,
3036 	.open    = softnet_seq_open,
3037 	.read    = seq_read,
3038 	.llseek  = seq_lseek,
3039 	.release = seq_release,
3040 };
3041 
3042 static void *ptype_get_idx(loff_t pos)
3043 {
3044 	struct packet_type *pt = NULL;
3045 	loff_t i = 0;
3046 	int t;
3047 
3048 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3049 		if (i == pos)
3050 			return pt;
3051 		++i;
3052 	}
3053 
3054 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3055 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3056 			if (i == pos)
3057 				return pt;
3058 			++i;
3059 		}
3060 	}
3061 	return NULL;
3062 }
3063 
3064 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3065 	__acquires(RCU)
3066 {
3067 	rcu_read_lock();
3068 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3069 }
3070 
3071 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3072 {
3073 	struct packet_type *pt;
3074 	struct list_head *nxt;
3075 	int hash;
3076 
3077 	++*pos;
3078 	if (v == SEQ_START_TOKEN)
3079 		return ptype_get_idx(0);
3080 
3081 	pt = v;
3082 	nxt = pt->list.next;
3083 	if (pt->type == htons(ETH_P_ALL)) {
3084 		if (nxt != &ptype_all)
3085 			goto found;
3086 		hash = 0;
3087 		nxt = ptype_base[0].next;
3088 	} else
3089 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3090 
3091 	while (nxt == &ptype_base[hash]) {
3092 		if (++hash >= PTYPE_HASH_SIZE)
3093 			return NULL;
3094 		nxt = ptype_base[hash].next;
3095 	}
3096 found:
3097 	return list_entry(nxt, struct packet_type, list);
3098 }
3099 
3100 static void ptype_seq_stop(struct seq_file *seq, void *v)
3101 	__releases(RCU)
3102 {
3103 	rcu_read_unlock();
3104 }
3105 
3106 static int ptype_seq_show(struct seq_file *seq, void *v)
3107 {
3108 	struct packet_type *pt = v;
3109 
3110 	if (v == SEQ_START_TOKEN)
3111 		seq_puts(seq, "Type Device      Function\n");
3112 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3113 		if (pt->type == htons(ETH_P_ALL))
3114 			seq_puts(seq, "ALL ");
3115 		else
3116 			seq_printf(seq, "%04x", ntohs(pt->type));
3117 
3118 		seq_printf(seq, " %-8s %pF\n",
3119 			   pt->dev ? pt->dev->name : "", pt->func);
3120 	}
3121 
3122 	return 0;
3123 }
3124 
3125 static const struct seq_operations ptype_seq_ops = {
3126 	.start = ptype_seq_start,
3127 	.next  = ptype_seq_next,
3128 	.stop  = ptype_seq_stop,
3129 	.show  = ptype_seq_show,
3130 };
3131 
3132 static int ptype_seq_open(struct inode *inode, struct file *file)
3133 {
3134 	return seq_open_net(inode, file, &ptype_seq_ops,
3135 			sizeof(struct seq_net_private));
3136 }
3137 
3138 static const struct file_operations ptype_seq_fops = {
3139 	.owner	 = THIS_MODULE,
3140 	.open    = ptype_seq_open,
3141 	.read    = seq_read,
3142 	.llseek  = seq_lseek,
3143 	.release = seq_release_net,
3144 };
3145 
3146 
3147 static int __net_init dev_proc_net_init(struct net *net)
3148 {
3149 	int rc = -ENOMEM;
3150 
3151 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3152 		goto out;
3153 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3154 		goto out_dev;
3155 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3156 		goto out_softnet;
3157 
3158 	if (wext_proc_init(net))
3159 		goto out_ptype;
3160 	rc = 0;
3161 out:
3162 	return rc;
3163 out_ptype:
3164 	proc_net_remove(net, "ptype");
3165 out_softnet:
3166 	proc_net_remove(net, "softnet_stat");
3167 out_dev:
3168 	proc_net_remove(net, "dev");
3169 	goto out;
3170 }
3171 
3172 static void __net_exit dev_proc_net_exit(struct net *net)
3173 {
3174 	wext_proc_exit(net);
3175 
3176 	proc_net_remove(net, "ptype");
3177 	proc_net_remove(net, "softnet_stat");
3178 	proc_net_remove(net, "dev");
3179 }
3180 
3181 static struct pernet_operations __net_initdata dev_proc_ops = {
3182 	.init = dev_proc_net_init,
3183 	.exit = dev_proc_net_exit,
3184 };
3185 
3186 static int __init dev_proc_init(void)
3187 {
3188 	return register_pernet_subsys(&dev_proc_ops);
3189 }
3190 #else
3191 #define dev_proc_init() 0
3192 #endif	/* CONFIG_PROC_FS */
3193 
3194 
3195 /**
3196  *	netdev_set_master	-	set up master/slave pair
3197  *	@slave: slave device
3198  *	@master: new master device
3199  *
3200  *	Changes the master device of the slave. Pass %NULL to break the
3201  *	bonding. The caller must hold the RTNL semaphore. On a failure
3202  *	a negative errno code is returned. On success the reference counts
3203  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3204  *	function returns zero.
3205  */
3206 int netdev_set_master(struct net_device *slave, struct net_device *master)
3207 {
3208 	struct net_device *old = slave->master;
3209 
3210 	ASSERT_RTNL();
3211 
3212 	if (master) {
3213 		if (old)
3214 			return -EBUSY;
3215 		dev_hold(master);
3216 	}
3217 
3218 	slave->master = master;
3219 
3220 	synchronize_net();
3221 
3222 	if (old)
3223 		dev_put(old);
3224 
3225 	if (master)
3226 		slave->flags |= IFF_SLAVE;
3227 	else
3228 		slave->flags &= ~IFF_SLAVE;
3229 
3230 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3231 	return 0;
3232 }
3233 
3234 static void dev_change_rx_flags(struct net_device *dev, int flags)
3235 {
3236 	const struct net_device_ops *ops = dev->netdev_ops;
3237 
3238 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3239 		ops->ndo_change_rx_flags(dev, flags);
3240 }
3241 
3242 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3243 {
3244 	unsigned short old_flags = dev->flags;
3245 	uid_t uid;
3246 	gid_t gid;
3247 
3248 	ASSERT_RTNL();
3249 
3250 	dev->flags |= IFF_PROMISC;
3251 	dev->promiscuity += inc;
3252 	if (dev->promiscuity == 0) {
3253 		/*
3254 		 * Avoid overflow.
3255 		 * If inc causes overflow, untouch promisc and return error.
3256 		 */
3257 		if (inc < 0)
3258 			dev->flags &= ~IFF_PROMISC;
3259 		else {
3260 			dev->promiscuity -= inc;
3261 			printk(KERN_WARNING "%s: promiscuity touches roof, "
3262 				"set promiscuity failed, promiscuity feature "
3263 				"of device might be broken.\n", dev->name);
3264 			return -EOVERFLOW;
3265 		}
3266 	}
3267 	if (dev->flags != old_flags) {
3268 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3269 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3270 							       "left");
3271 		if (audit_enabled) {
3272 			current_uid_gid(&uid, &gid);
3273 			audit_log(current->audit_context, GFP_ATOMIC,
3274 				AUDIT_ANOM_PROMISCUOUS,
3275 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3276 				dev->name, (dev->flags & IFF_PROMISC),
3277 				(old_flags & IFF_PROMISC),
3278 				audit_get_loginuid(current),
3279 				uid, gid,
3280 				audit_get_sessionid(current));
3281 		}
3282 
3283 		dev_change_rx_flags(dev, IFF_PROMISC);
3284 	}
3285 	return 0;
3286 }
3287 
3288 /**
3289  *	dev_set_promiscuity	- update promiscuity count on a device
3290  *	@dev: device
3291  *	@inc: modifier
3292  *
3293  *	Add or remove promiscuity from a device. While the count in the device
3294  *	remains above zero the interface remains promiscuous. Once it hits zero
3295  *	the device reverts back to normal filtering operation. A negative inc
3296  *	value is used to drop promiscuity on the device.
3297  *	Return 0 if successful or a negative errno code on error.
3298  */
3299 int dev_set_promiscuity(struct net_device *dev, int inc)
3300 {
3301 	unsigned short old_flags = dev->flags;
3302 	int err;
3303 
3304 	err = __dev_set_promiscuity(dev, inc);
3305 	if (err < 0)
3306 		return err;
3307 	if (dev->flags != old_flags)
3308 		dev_set_rx_mode(dev);
3309 	return err;
3310 }
3311 
3312 /**
3313  *	dev_set_allmulti	- update allmulti count on a device
3314  *	@dev: device
3315  *	@inc: modifier
3316  *
3317  *	Add or remove reception of all multicast frames to a device. While the
3318  *	count in the device remains above zero the interface remains listening
3319  *	to all interfaces. Once it hits zero the device reverts back to normal
3320  *	filtering operation. A negative @inc value is used to drop the counter
3321  *	when releasing a resource needing all multicasts.
3322  *	Return 0 if successful or a negative errno code on error.
3323  */
3324 
3325 int dev_set_allmulti(struct net_device *dev, int inc)
3326 {
3327 	unsigned short old_flags = dev->flags;
3328 
3329 	ASSERT_RTNL();
3330 
3331 	dev->flags |= IFF_ALLMULTI;
3332 	dev->allmulti += inc;
3333 	if (dev->allmulti == 0) {
3334 		/*
3335 		 * Avoid overflow.
3336 		 * If inc causes overflow, untouch allmulti and return error.
3337 		 */
3338 		if (inc < 0)
3339 			dev->flags &= ~IFF_ALLMULTI;
3340 		else {
3341 			dev->allmulti -= inc;
3342 			printk(KERN_WARNING "%s: allmulti touches roof, "
3343 				"set allmulti failed, allmulti feature of "
3344 				"device might be broken.\n", dev->name);
3345 			return -EOVERFLOW;
3346 		}
3347 	}
3348 	if (dev->flags ^ old_flags) {
3349 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3350 		dev_set_rx_mode(dev);
3351 	}
3352 	return 0;
3353 }
3354 
3355 /*
3356  *	Upload unicast and multicast address lists to device and
3357  *	configure RX filtering. When the device doesn't support unicast
3358  *	filtering it is put in promiscuous mode while unicast addresses
3359  *	are present.
3360  */
3361 void __dev_set_rx_mode(struct net_device *dev)
3362 {
3363 	const struct net_device_ops *ops = dev->netdev_ops;
3364 
3365 	/* dev_open will call this function so the list will stay sane. */
3366 	if (!(dev->flags&IFF_UP))
3367 		return;
3368 
3369 	if (!netif_device_present(dev))
3370 		return;
3371 
3372 	if (ops->ndo_set_rx_mode)
3373 		ops->ndo_set_rx_mode(dev);
3374 	else {
3375 		/* Unicast addresses changes may only happen under the rtnl,
3376 		 * therefore calling __dev_set_promiscuity here is safe.
3377 		 */
3378 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3379 			__dev_set_promiscuity(dev, 1);
3380 			dev->uc_promisc = 1;
3381 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3382 			__dev_set_promiscuity(dev, -1);
3383 			dev->uc_promisc = 0;
3384 		}
3385 
3386 		if (ops->ndo_set_multicast_list)
3387 			ops->ndo_set_multicast_list(dev);
3388 	}
3389 }
3390 
3391 void dev_set_rx_mode(struct net_device *dev)
3392 {
3393 	netif_addr_lock_bh(dev);
3394 	__dev_set_rx_mode(dev);
3395 	netif_addr_unlock_bh(dev);
3396 }
3397 
3398 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3399 		      void *addr, int alen, int glbl)
3400 {
3401 	struct dev_addr_list *da;
3402 
3403 	for (; (da = *list) != NULL; list = &da->next) {
3404 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3405 		    alen == da->da_addrlen) {
3406 			if (glbl) {
3407 				int old_glbl = da->da_gusers;
3408 				da->da_gusers = 0;
3409 				if (old_glbl == 0)
3410 					break;
3411 			}
3412 			if (--da->da_users)
3413 				return 0;
3414 
3415 			*list = da->next;
3416 			kfree(da);
3417 			(*count)--;
3418 			return 0;
3419 		}
3420 	}
3421 	return -ENOENT;
3422 }
3423 
3424 int __dev_addr_add(struct dev_addr_list **list, int *count,
3425 		   void *addr, int alen, int glbl)
3426 {
3427 	struct dev_addr_list *da;
3428 
3429 	for (da = *list; da != NULL; da = da->next) {
3430 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3431 		    da->da_addrlen == alen) {
3432 			if (glbl) {
3433 				int old_glbl = da->da_gusers;
3434 				da->da_gusers = 1;
3435 				if (old_glbl)
3436 					return 0;
3437 			}
3438 			da->da_users++;
3439 			return 0;
3440 		}
3441 	}
3442 
3443 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3444 	if (da == NULL)
3445 		return -ENOMEM;
3446 	memcpy(da->da_addr, addr, alen);
3447 	da->da_addrlen = alen;
3448 	da->da_users = 1;
3449 	da->da_gusers = glbl ? 1 : 0;
3450 	da->next = *list;
3451 	*list = da;
3452 	(*count)++;
3453 	return 0;
3454 }
3455 
3456 /**
3457  *	dev_unicast_delete	- Release secondary unicast address.
3458  *	@dev: device
3459  *	@addr: address to delete
3460  *	@alen: length of @addr
3461  *
3462  *	Release reference to a secondary unicast address and remove it
3463  *	from the device if the reference count drops to zero.
3464  *
3465  * 	The caller must hold the rtnl_mutex.
3466  */
3467 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3468 {
3469 	int err;
3470 
3471 	ASSERT_RTNL();
3472 
3473 	netif_addr_lock_bh(dev);
3474 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3475 	if (!err)
3476 		__dev_set_rx_mode(dev);
3477 	netif_addr_unlock_bh(dev);
3478 	return err;
3479 }
3480 EXPORT_SYMBOL(dev_unicast_delete);
3481 
3482 /**
3483  *	dev_unicast_add		- add a secondary unicast address
3484  *	@dev: device
3485  *	@addr: address to add
3486  *	@alen: length of @addr
3487  *
3488  *	Add a secondary unicast address to the device or increase
3489  *	the reference count if it already exists.
3490  *
3491  *	The caller must hold the rtnl_mutex.
3492  */
3493 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3494 {
3495 	int err;
3496 
3497 	ASSERT_RTNL();
3498 
3499 	netif_addr_lock_bh(dev);
3500 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3501 	if (!err)
3502 		__dev_set_rx_mode(dev);
3503 	netif_addr_unlock_bh(dev);
3504 	return err;
3505 }
3506 EXPORT_SYMBOL(dev_unicast_add);
3507 
3508 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3509 		    struct dev_addr_list **from, int *from_count)
3510 {
3511 	struct dev_addr_list *da, *next;
3512 	int err = 0;
3513 
3514 	da = *from;
3515 	while (da != NULL) {
3516 		next = da->next;
3517 		if (!da->da_synced) {
3518 			err = __dev_addr_add(to, to_count,
3519 					     da->da_addr, da->da_addrlen, 0);
3520 			if (err < 0)
3521 				break;
3522 			da->da_synced = 1;
3523 			da->da_users++;
3524 		} else if (da->da_users == 1) {
3525 			__dev_addr_delete(to, to_count,
3526 					  da->da_addr, da->da_addrlen, 0);
3527 			__dev_addr_delete(from, from_count,
3528 					  da->da_addr, da->da_addrlen, 0);
3529 		}
3530 		da = next;
3531 	}
3532 	return err;
3533 }
3534 
3535 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3536 		       struct dev_addr_list **from, int *from_count)
3537 {
3538 	struct dev_addr_list *da, *next;
3539 
3540 	da = *from;
3541 	while (da != NULL) {
3542 		next = da->next;
3543 		if (da->da_synced) {
3544 			__dev_addr_delete(to, to_count,
3545 					  da->da_addr, da->da_addrlen, 0);
3546 			da->da_synced = 0;
3547 			__dev_addr_delete(from, from_count,
3548 					  da->da_addr, da->da_addrlen, 0);
3549 		}
3550 		da = next;
3551 	}
3552 }
3553 
3554 /**
3555  *	dev_unicast_sync - Synchronize device's unicast list to another device
3556  *	@to: destination device
3557  *	@from: source device
3558  *
3559  *	Add newly added addresses to the destination device and release
3560  *	addresses that have no users left. The source device must be
3561  *	locked by netif_tx_lock_bh.
3562  *
3563  *	This function is intended to be called from the dev->set_rx_mode
3564  *	function of layered software devices.
3565  */
3566 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3567 {
3568 	int err = 0;
3569 
3570 	netif_addr_lock_bh(to);
3571 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3572 			      &from->uc_list, &from->uc_count);
3573 	if (!err)
3574 		__dev_set_rx_mode(to);
3575 	netif_addr_unlock_bh(to);
3576 	return err;
3577 }
3578 EXPORT_SYMBOL(dev_unicast_sync);
3579 
3580 /**
3581  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3582  *	@to: destination device
3583  *	@from: source device
3584  *
3585  *	Remove all addresses that were added to the destination device by
3586  *	dev_unicast_sync(). This function is intended to be called from the
3587  *	dev->stop function of layered software devices.
3588  */
3589 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3590 {
3591 	netif_addr_lock_bh(from);
3592 	netif_addr_lock(to);
3593 
3594 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3595 			  &from->uc_list, &from->uc_count);
3596 	__dev_set_rx_mode(to);
3597 
3598 	netif_addr_unlock(to);
3599 	netif_addr_unlock_bh(from);
3600 }
3601 EXPORT_SYMBOL(dev_unicast_unsync);
3602 
3603 static void __dev_addr_discard(struct dev_addr_list **list)
3604 {
3605 	struct dev_addr_list *tmp;
3606 
3607 	while (*list != NULL) {
3608 		tmp = *list;
3609 		*list = tmp->next;
3610 		if (tmp->da_users > tmp->da_gusers)
3611 			printk("__dev_addr_discard: address leakage! "
3612 			       "da_users=%d\n", tmp->da_users);
3613 		kfree(tmp);
3614 	}
3615 }
3616 
3617 static void dev_addr_discard(struct net_device *dev)
3618 {
3619 	netif_addr_lock_bh(dev);
3620 
3621 	__dev_addr_discard(&dev->uc_list);
3622 	dev->uc_count = 0;
3623 
3624 	__dev_addr_discard(&dev->mc_list);
3625 	dev->mc_count = 0;
3626 
3627 	netif_addr_unlock_bh(dev);
3628 }
3629 
3630 /**
3631  *	dev_get_flags - get flags reported to userspace
3632  *	@dev: device
3633  *
3634  *	Get the combination of flag bits exported through APIs to userspace.
3635  */
3636 unsigned dev_get_flags(const struct net_device *dev)
3637 {
3638 	unsigned flags;
3639 
3640 	flags = (dev->flags & ~(IFF_PROMISC |
3641 				IFF_ALLMULTI |
3642 				IFF_RUNNING |
3643 				IFF_LOWER_UP |
3644 				IFF_DORMANT)) |
3645 		(dev->gflags & (IFF_PROMISC |
3646 				IFF_ALLMULTI));
3647 
3648 	if (netif_running(dev)) {
3649 		if (netif_oper_up(dev))
3650 			flags |= IFF_RUNNING;
3651 		if (netif_carrier_ok(dev))
3652 			flags |= IFF_LOWER_UP;
3653 		if (netif_dormant(dev))
3654 			flags |= IFF_DORMANT;
3655 	}
3656 
3657 	return flags;
3658 }
3659 
3660 /**
3661  *	dev_change_flags - change device settings
3662  *	@dev: device
3663  *	@flags: device state flags
3664  *
3665  *	Change settings on device based state flags. The flags are
3666  *	in the userspace exported format.
3667  */
3668 int dev_change_flags(struct net_device *dev, unsigned flags)
3669 {
3670 	int ret, changes;
3671 	int old_flags = dev->flags;
3672 
3673 	ASSERT_RTNL();
3674 
3675 	/*
3676 	 *	Set the flags on our device.
3677 	 */
3678 
3679 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3680 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3681 			       IFF_AUTOMEDIA)) |
3682 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3683 				    IFF_ALLMULTI));
3684 
3685 	/*
3686 	 *	Load in the correct multicast list now the flags have changed.
3687 	 */
3688 
3689 	if ((old_flags ^ flags) & IFF_MULTICAST)
3690 		dev_change_rx_flags(dev, IFF_MULTICAST);
3691 
3692 	dev_set_rx_mode(dev);
3693 
3694 	/*
3695 	 *	Have we downed the interface. We handle IFF_UP ourselves
3696 	 *	according to user attempts to set it, rather than blindly
3697 	 *	setting it.
3698 	 */
3699 
3700 	ret = 0;
3701 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3702 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3703 
3704 		if (!ret)
3705 			dev_set_rx_mode(dev);
3706 	}
3707 
3708 	if (dev->flags & IFF_UP &&
3709 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3710 					  IFF_VOLATILE)))
3711 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3712 
3713 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3714 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3715 		dev->gflags ^= IFF_PROMISC;
3716 		dev_set_promiscuity(dev, inc);
3717 	}
3718 
3719 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3720 	   is important. Some (broken) drivers set IFF_PROMISC, when
3721 	   IFF_ALLMULTI is requested not asking us and not reporting.
3722 	 */
3723 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3724 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3725 		dev->gflags ^= IFF_ALLMULTI;
3726 		dev_set_allmulti(dev, inc);
3727 	}
3728 
3729 	/* Exclude state transition flags, already notified */
3730 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3731 	if (changes)
3732 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3733 
3734 	return ret;
3735 }
3736 
3737 /**
3738  *	dev_set_mtu - Change maximum transfer unit
3739  *	@dev: device
3740  *	@new_mtu: new transfer unit
3741  *
3742  *	Change the maximum transfer size of the network device.
3743  */
3744 int dev_set_mtu(struct net_device *dev, int new_mtu)
3745 {
3746 	const struct net_device_ops *ops = dev->netdev_ops;
3747 	int err;
3748 
3749 	if (new_mtu == dev->mtu)
3750 		return 0;
3751 
3752 	/*	MTU must be positive.	 */
3753 	if (new_mtu < 0)
3754 		return -EINVAL;
3755 
3756 	if (!netif_device_present(dev))
3757 		return -ENODEV;
3758 
3759 	err = 0;
3760 	if (ops->ndo_change_mtu)
3761 		err = ops->ndo_change_mtu(dev, new_mtu);
3762 	else
3763 		dev->mtu = new_mtu;
3764 
3765 	if (!err && dev->flags & IFF_UP)
3766 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3767 	return err;
3768 }
3769 
3770 /**
3771  *	dev_set_mac_address - Change Media Access Control Address
3772  *	@dev: device
3773  *	@sa: new address
3774  *
3775  *	Change the hardware (MAC) address of the device
3776  */
3777 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3778 {
3779 	const struct net_device_ops *ops = dev->netdev_ops;
3780 	int err;
3781 
3782 	if (!ops->ndo_set_mac_address)
3783 		return -EOPNOTSUPP;
3784 	if (sa->sa_family != dev->type)
3785 		return -EINVAL;
3786 	if (!netif_device_present(dev))
3787 		return -ENODEV;
3788 	err = ops->ndo_set_mac_address(dev, sa);
3789 	if (!err)
3790 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3791 	return err;
3792 }
3793 
3794 /*
3795  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3796  */
3797 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3798 {
3799 	int err;
3800 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3801 
3802 	if (!dev)
3803 		return -ENODEV;
3804 
3805 	switch (cmd) {
3806 		case SIOCGIFFLAGS:	/* Get interface flags */
3807 			ifr->ifr_flags = dev_get_flags(dev);
3808 			return 0;
3809 
3810 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3811 					   (currently unused) */
3812 			ifr->ifr_metric = 0;
3813 			return 0;
3814 
3815 		case SIOCGIFMTU:	/* Get the MTU of a device */
3816 			ifr->ifr_mtu = dev->mtu;
3817 			return 0;
3818 
3819 		case SIOCGIFHWADDR:
3820 			if (!dev->addr_len)
3821 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3822 			else
3823 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3824 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3825 			ifr->ifr_hwaddr.sa_family = dev->type;
3826 			return 0;
3827 
3828 		case SIOCGIFSLAVE:
3829 			err = -EINVAL;
3830 			break;
3831 
3832 		case SIOCGIFMAP:
3833 			ifr->ifr_map.mem_start = dev->mem_start;
3834 			ifr->ifr_map.mem_end   = dev->mem_end;
3835 			ifr->ifr_map.base_addr = dev->base_addr;
3836 			ifr->ifr_map.irq       = dev->irq;
3837 			ifr->ifr_map.dma       = dev->dma;
3838 			ifr->ifr_map.port      = dev->if_port;
3839 			return 0;
3840 
3841 		case SIOCGIFINDEX:
3842 			ifr->ifr_ifindex = dev->ifindex;
3843 			return 0;
3844 
3845 		case SIOCGIFTXQLEN:
3846 			ifr->ifr_qlen = dev->tx_queue_len;
3847 			return 0;
3848 
3849 		default:
3850 			/* dev_ioctl() should ensure this case
3851 			 * is never reached
3852 			 */
3853 			WARN_ON(1);
3854 			err = -EINVAL;
3855 			break;
3856 
3857 	}
3858 	return err;
3859 }
3860 
3861 /*
3862  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3863  */
3864 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3865 {
3866 	int err;
3867 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3868 	const struct net_device_ops *ops;
3869 
3870 	if (!dev)
3871 		return -ENODEV;
3872 
3873 	ops = dev->netdev_ops;
3874 
3875 	switch (cmd) {
3876 		case SIOCSIFFLAGS:	/* Set interface flags */
3877 			return dev_change_flags(dev, ifr->ifr_flags);
3878 
3879 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3880 					   (currently unused) */
3881 			return -EOPNOTSUPP;
3882 
3883 		case SIOCSIFMTU:	/* Set the MTU of a device */
3884 			return dev_set_mtu(dev, ifr->ifr_mtu);
3885 
3886 		case SIOCSIFHWADDR:
3887 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3888 
3889 		case SIOCSIFHWBROADCAST:
3890 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3891 				return -EINVAL;
3892 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3893 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3894 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3895 			return 0;
3896 
3897 		case SIOCSIFMAP:
3898 			if (ops->ndo_set_config) {
3899 				if (!netif_device_present(dev))
3900 					return -ENODEV;
3901 				return ops->ndo_set_config(dev, &ifr->ifr_map);
3902 			}
3903 			return -EOPNOTSUPP;
3904 
3905 		case SIOCADDMULTI:
3906 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3907 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3908 				return -EINVAL;
3909 			if (!netif_device_present(dev))
3910 				return -ENODEV;
3911 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3912 					  dev->addr_len, 1);
3913 
3914 		case SIOCDELMULTI:
3915 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3916 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3917 				return -EINVAL;
3918 			if (!netif_device_present(dev))
3919 				return -ENODEV;
3920 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3921 					     dev->addr_len, 1);
3922 
3923 		case SIOCSIFTXQLEN:
3924 			if (ifr->ifr_qlen < 0)
3925 				return -EINVAL;
3926 			dev->tx_queue_len = ifr->ifr_qlen;
3927 			return 0;
3928 
3929 		case SIOCSIFNAME:
3930 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3931 			return dev_change_name(dev, ifr->ifr_newname);
3932 
3933 		/*
3934 		 *	Unknown or private ioctl
3935 		 */
3936 
3937 		default:
3938 			if ((cmd >= SIOCDEVPRIVATE &&
3939 			    cmd <= SIOCDEVPRIVATE + 15) ||
3940 			    cmd == SIOCBONDENSLAVE ||
3941 			    cmd == SIOCBONDRELEASE ||
3942 			    cmd == SIOCBONDSETHWADDR ||
3943 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3944 			    cmd == SIOCBONDINFOQUERY ||
3945 			    cmd == SIOCBONDCHANGEACTIVE ||
3946 			    cmd == SIOCGMIIPHY ||
3947 			    cmd == SIOCGMIIREG ||
3948 			    cmd == SIOCSMIIREG ||
3949 			    cmd == SIOCBRADDIF ||
3950 			    cmd == SIOCBRDELIF ||
3951 			    cmd == SIOCWANDEV) {
3952 				err = -EOPNOTSUPP;
3953 				if (ops->ndo_do_ioctl) {
3954 					if (netif_device_present(dev))
3955 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3956 					else
3957 						err = -ENODEV;
3958 				}
3959 			} else
3960 				err = -EINVAL;
3961 
3962 	}
3963 	return err;
3964 }
3965 
3966 /*
3967  *	This function handles all "interface"-type I/O control requests. The actual
3968  *	'doing' part of this is dev_ifsioc above.
3969  */
3970 
3971 /**
3972  *	dev_ioctl	-	network device ioctl
3973  *	@net: the applicable net namespace
3974  *	@cmd: command to issue
3975  *	@arg: pointer to a struct ifreq in user space
3976  *
3977  *	Issue ioctl functions to devices. This is normally called by the
3978  *	user space syscall interfaces but can sometimes be useful for
3979  *	other purposes. The return value is the return from the syscall if
3980  *	positive or a negative errno code on error.
3981  */
3982 
3983 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3984 {
3985 	struct ifreq ifr;
3986 	int ret;
3987 	char *colon;
3988 
3989 	/* One special case: SIOCGIFCONF takes ifconf argument
3990 	   and requires shared lock, because it sleeps writing
3991 	   to user space.
3992 	 */
3993 
3994 	if (cmd == SIOCGIFCONF) {
3995 		rtnl_lock();
3996 		ret = dev_ifconf(net, (char __user *) arg);
3997 		rtnl_unlock();
3998 		return ret;
3999 	}
4000 	if (cmd == SIOCGIFNAME)
4001 		return dev_ifname(net, (struct ifreq __user *)arg);
4002 
4003 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4004 		return -EFAULT;
4005 
4006 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4007 
4008 	colon = strchr(ifr.ifr_name, ':');
4009 	if (colon)
4010 		*colon = 0;
4011 
4012 	/*
4013 	 *	See which interface the caller is talking about.
4014 	 */
4015 
4016 	switch (cmd) {
4017 		/*
4018 		 *	These ioctl calls:
4019 		 *	- can be done by all.
4020 		 *	- atomic and do not require locking.
4021 		 *	- return a value
4022 		 */
4023 		case SIOCGIFFLAGS:
4024 		case SIOCGIFMETRIC:
4025 		case SIOCGIFMTU:
4026 		case SIOCGIFHWADDR:
4027 		case SIOCGIFSLAVE:
4028 		case SIOCGIFMAP:
4029 		case SIOCGIFINDEX:
4030 		case SIOCGIFTXQLEN:
4031 			dev_load(net, ifr.ifr_name);
4032 			read_lock(&dev_base_lock);
4033 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4034 			read_unlock(&dev_base_lock);
4035 			if (!ret) {
4036 				if (colon)
4037 					*colon = ':';
4038 				if (copy_to_user(arg, &ifr,
4039 						 sizeof(struct ifreq)))
4040 					ret = -EFAULT;
4041 			}
4042 			return ret;
4043 
4044 		case SIOCETHTOOL:
4045 			dev_load(net, ifr.ifr_name);
4046 			rtnl_lock();
4047 			ret = dev_ethtool(net, &ifr);
4048 			rtnl_unlock();
4049 			if (!ret) {
4050 				if (colon)
4051 					*colon = ':';
4052 				if (copy_to_user(arg, &ifr,
4053 						 sizeof(struct ifreq)))
4054 					ret = -EFAULT;
4055 			}
4056 			return ret;
4057 
4058 		/*
4059 		 *	These ioctl calls:
4060 		 *	- require superuser power.
4061 		 *	- require strict serialization.
4062 		 *	- return a value
4063 		 */
4064 		case SIOCGMIIPHY:
4065 		case SIOCGMIIREG:
4066 		case SIOCSIFNAME:
4067 			if (!capable(CAP_NET_ADMIN))
4068 				return -EPERM;
4069 			dev_load(net, ifr.ifr_name);
4070 			rtnl_lock();
4071 			ret = dev_ifsioc(net, &ifr, cmd);
4072 			rtnl_unlock();
4073 			if (!ret) {
4074 				if (colon)
4075 					*colon = ':';
4076 				if (copy_to_user(arg, &ifr,
4077 						 sizeof(struct ifreq)))
4078 					ret = -EFAULT;
4079 			}
4080 			return ret;
4081 
4082 		/*
4083 		 *	These ioctl calls:
4084 		 *	- require superuser power.
4085 		 *	- require strict serialization.
4086 		 *	- do not return a value
4087 		 */
4088 		case SIOCSIFFLAGS:
4089 		case SIOCSIFMETRIC:
4090 		case SIOCSIFMTU:
4091 		case SIOCSIFMAP:
4092 		case SIOCSIFHWADDR:
4093 		case SIOCSIFSLAVE:
4094 		case SIOCADDMULTI:
4095 		case SIOCDELMULTI:
4096 		case SIOCSIFHWBROADCAST:
4097 		case SIOCSIFTXQLEN:
4098 		case SIOCSMIIREG:
4099 		case SIOCBONDENSLAVE:
4100 		case SIOCBONDRELEASE:
4101 		case SIOCBONDSETHWADDR:
4102 		case SIOCBONDCHANGEACTIVE:
4103 		case SIOCBRADDIF:
4104 		case SIOCBRDELIF:
4105 			if (!capable(CAP_NET_ADMIN))
4106 				return -EPERM;
4107 			/* fall through */
4108 		case SIOCBONDSLAVEINFOQUERY:
4109 		case SIOCBONDINFOQUERY:
4110 			dev_load(net, ifr.ifr_name);
4111 			rtnl_lock();
4112 			ret = dev_ifsioc(net, &ifr, cmd);
4113 			rtnl_unlock();
4114 			return ret;
4115 
4116 		case SIOCGIFMEM:
4117 			/* Get the per device memory space. We can add this but
4118 			 * currently do not support it */
4119 		case SIOCSIFMEM:
4120 			/* Set the per device memory buffer space.
4121 			 * Not applicable in our case */
4122 		case SIOCSIFLINK:
4123 			return -EINVAL;
4124 
4125 		/*
4126 		 *	Unknown or private ioctl.
4127 		 */
4128 		default:
4129 			if (cmd == SIOCWANDEV ||
4130 			    (cmd >= SIOCDEVPRIVATE &&
4131 			     cmd <= SIOCDEVPRIVATE + 15)) {
4132 				dev_load(net, ifr.ifr_name);
4133 				rtnl_lock();
4134 				ret = dev_ifsioc(net, &ifr, cmd);
4135 				rtnl_unlock();
4136 				if (!ret && copy_to_user(arg, &ifr,
4137 							 sizeof(struct ifreq)))
4138 					ret = -EFAULT;
4139 				return ret;
4140 			}
4141 			/* Take care of Wireless Extensions */
4142 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4143 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4144 			return -EINVAL;
4145 	}
4146 }
4147 
4148 
4149 /**
4150  *	dev_new_index	-	allocate an ifindex
4151  *	@net: the applicable net namespace
4152  *
4153  *	Returns a suitable unique value for a new device interface
4154  *	number.  The caller must hold the rtnl semaphore or the
4155  *	dev_base_lock to be sure it remains unique.
4156  */
4157 static int dev_new_index(struct net *net)
4158 {
4159 	static int ifindex;
4160 	for (;;) {
4161 		if (++ifindex <= 0)
4162 			ifindex = 1;
4163 		if (!__dev_get_by_index(net, ifindex))
4164 			return ifindex;
4165 	}
4166 }
4167 
4168 /* Delayed registration/unregisteration */
4169 static LIST_HEAD(net_todo_list);
4170 
4171 static void net_set_todo(struct net_device *dev)
4172 {
4173 	list_add_tail(&dev->todo_list, &net_todo_list);
4174 }
4175 
4176 static void rollback_registered(struct net_device *dev)
4177 {
4178 	BUG_ON(dev_boot_phase);
4179 	ASSERT_RTNL();
4180 
4181 	/* Some devices call without registering for initialization unwind. */
4182 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4183 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4184 				  "was registered\n", dev->name, dev);
4185 
4186 		WARN_ON(1);
4187 		return;
4188 	}
4189 
4190 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4191 
4192 	/* If device is running, close it first. */
4193 	dev_close(dev);
4194 
4195 	/* And unlink it from device chain. */
4196 	unlist_netdevice(dev);
4197 
4198 	dev->reg_state = NETREG_UNREGISTERING;
4199 
4200 	synchronize_net();
4201 
4202 	/* Shutdown queueing discipline. */
4203 	dev_shutdown(dev);
4204 
4205 
4206 	/* Notify protocols, that we are about to destroy
4207 	   this device. They should clean all the things.
4208 	*/
4209 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4210 
4211 	/*
4212 	 *	Flush the unicast and multicast chains
4213 	 */
4214 	dev_addr_discard(dev);
4215 
4216 	if (dev->netdev_ops->ndo_uninit)
4217 		dev->netdev_ops->ndo_uninit(dev);
4218 
4219 	/* Notifier chain MUST detach us from master device. */
4220 	WARN_ON(dev->master);
4221 
4222 	/* Remove entries from kobject tree */
4223 	netdev_unregister_kobject(dev);
4224 
4225 	synchronize_net();
4226 
4227 	dev_put(dev);
4228 }
4229 
4230 static void __netdev_init_queue_locks_one(struct net_device *dev,
4231 					  struct netdev_queue *dev_queue,
4232 					  void *_unused)
4233 {
4234 	spin_lock_init(&dev_queue->_xmit_lock);
4235 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4236 	dev_queue->xmit_lock_owner = -1;
4237 }
4238 
4239 static void netdev_init_queue_locks(struct net_device *dev)
4240 {
4241 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4242 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4243 }
4244 
4245 unsigned long netdev_fix_features(unsigned long features, const char *name)
4246 {
4247 	/* Fix illegal SG+CSUM combinations. */
4248 	if ((features & NETIF_F_SG) &&
4249 	    !(features & NETIF_F_ALL_CSUM)) {
4250 		if (name)
4251 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4252 			       "checksum feature.\n", name);
4253 		features &= ~NETIF_F_SG;
4254 	}
4255 
4256 	/* TSO requires that SG is present as well. */
4257 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4258 		if (name)
4259 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4260 			       "SG feature.\n", name);
4261 		features &= ~NETIF_F_TSO;
4262 	}
4263 
4264 	if (features & NETIF_F_UFO) {
4265 		if (!(features & NETIF_F_GEN_CSUM)) {
4266 			if (name)
4267 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4268 				       "since no NETIF_F_HW_CSUM feature.\n",
4269 				       name);
4270 			features &= ~NETIF_F_UFO;
4271 		}
4272 
4273 		if (!(features & NETIF_F_SG)) {
4274 			if (name)
4275 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4276 				       "since no NETIF_F_SG feature.\n", name);
4277 			features &= ~NETIF_F_UFO;
4278 		}
4279 	}
4280 
4281 	return features;
4282 }
4283 EXPORT_SYMBOL(netdev_fix_features);
4284 
4285 /* Some devices need to (re-)set their netdev_ops inside
4286  * ->init() or similar.  If that happens, we have to setup
4287  * the compat pointers again.
4288  */
4289 void netdev_resync_ops(struct net_device *dev)
4290 {
4291 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4292 	const struct net_device_ops *ops = dev->netdev_ops;
4293 
4294 	dev->init = ops->ndo_init;
4295 	dev->uninit = ops->ndo_uninit;
4296 	dev->open = ops->ndo_open;
4297 	dev->change_rx_flags = ops->ndo_change_rx_flags;
4298 	dev->set_rx_mode = ops->ndo_set_rx_mode;
4299 	dev->set_multicast_list = ops->ndo_set_multicast_list;
4300 	dev->set_mac_address = ops->ndo_set_mac_address;
4301 	dev->validate_addr = ops->ndo_validate_addr;
4302 	dev->do_ioctl = ops->ndo_do_ioctl;
4303 	dev->set_config = ops->ndo_set_config;
4304 	dev->change_mtu = ops->ndo_change_mtu;
4305 	dev->neigh_setup = ops->ndo_neigh_setup;
4306 	dev->tx_timeout = ops->ndo_tx_timeout;
4307 	dev->get_stats = ops->ndo_get_stats;
4308 	dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4309 	dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4310 	dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4311 #ifdef CONFIG_NET_POLL_CONTROLLER
4312 	dev->poll_controller = ops->ndo_poll_controller;
4313 #endif
4314 #endif
4315 }
4316 EXPORT_SYMBOL(netdev_resync_ops);
4317 
4318 /**
4319  *	register_netdevice	- register a network device
4320  *	@dev: device to register
4321  *
4322  *	Take a completed network device structure and add it to the kernel
4323  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4324  *	chain. 0 is returned on success. A negative errno code is returned
4325  *	on a failure to set up the device, or if the name is a duplicate.
4326  *
4327  *	Callers must hold the rtnl semaphore. You may want
4328  *	register_netdev() instead of this.
4329  *
4330  *	BUGS:
4331  *	The locking appears insufficient to guarantee two parallel registers
4332  *	will not get the same name.
4333  */
4334 
4335 int register_netdevice(struct net_device *dev)
4336 {
4337 	struct hlist_head *head;
4338 	struct hlist_node *p;
4339 	int ret;
4340 	struct net *net = dev_net(dev);
4341 
4342 	BUG_ON(dev_boot_phase);
4343 	ASSERT_RTNL();
4344 
4345 	might_sleep();
4346 
4347 	/* When net_device's are persistent, this will be fatal. */
4348 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4349 	BUG_ON(!net);
4350 
4351 	spin_lock_init(&dev->addr_list_lock);
4352 	netdev_set_addr_lockdep_class(dev);
4353 	netdev_init_queue_locks(dev);
4354 
4355 	dev->iflink = -1;
4356 
4357 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4358 	/* Netdevice_ops API compatiability support.
4359 	 * This is temporary until all network devices are converted.
4360 	 */
4361 	if (dev->netdev_ops) {
4362 		netdev_resync_ops(dev);
4363 	} else {
4364 		char drivername[64];
4365 		pr_info("%s (%s): not using net_device_ops yet\n",
4366 			dev->name, netdev_drivername(dev, drivername, 64));
4367 
4368 		/* This works only because net_device_ops and the
4369 		   compatiablity structure are the same. */
4370 		dev->netdev_ops = (void *) &(dev->init);
4371 	}
4372 #endif
4373 
4374 	/* Init, if this function is available */
4375 	if (dev->netdev_ops->ndo_init) {
4376 		ret = dev->netdev_ops->ndo_init(dev);
4377 		if (ret) {
4378 			if (ret > 0)
4379 				ret = -EIO;
4380 			goto out;
4381 		}
4382 	}
4383 
4384 	if (!dev_valid_name(dev->name)) {
4385 		ret = -EINVAL;
4386 		goto err_uninit;
4387 	}
4388 
4389 	dev->ifindex = dev_new_index(net);
4390 	if (dev->iflink == -1)
4391 		dev->iflink = dev->ifindex;
4392 
4393 	/* Check for existence of name */
4394 	head = dev_name_hash(net, dev->name);
4395 	hlist_for_each(p, head) {
4396 		struct net_device *d
4397 			= hlist_entry(p, struct net_device, name_hlist);
4398 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4399 			ret = -EEXIST;
4400 			goto err_uninit;
4401 		}
4402 	}
4403 
4404 	/* Fix illegal checksum combinations */
4405 	if ((dev->features & NETIF_F_HW_CSUM) &&
4406 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4407 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4408 		       dev->name);
4409 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4410 	}
4411 
4412 	if ((dev->features & NETIF_F_NO_CSUM) &&
4413 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4414 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4415 		       dev->name);
4416 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4417 	}
4418 
4419 	dev->features = netdev_fix_features(dev->features, dev->name);
4420 
4421 	/* Enable software GSO if SG is supported. */
4422 	if (dev->features & NETIF_F_SG)
4423 		dev->features |= NETIF_F_GSO;
4424 
4425 	netdev_initialize_kobject(dev);
4426 	ret = netdev_register_kobject(dev);
4427 	if (ret)
4428 		goto err_uninit;
4429 	dev->reg_state = NETREG_REGISTERED;
4430 
4431 	/*
4432 	 *	Default initial state at registry is that the
4433 	 *	device is present.
4434 	 */
4435 
4436 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4437 
4438 	dev_init_scheduler(dev);
4439 	dev_hold(dev);
4440 	list_netdevice(dev);
4441 
4442 	/* Notify protocols, that a new device appeared. */
4443 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4444 	ret = notifier_to_errno(ret);
4445 	if (ret) {
4446 		rollback_registered(dev);
4447 		dev->reg_state = NETREG_UNREGISTERED;
4448 	}
4449 
4450 out:
4451 	return ret;
4452 
4453 err_uninit:
4454 	if (dev->netdev_ops->ndo_uninit)
4455 		dev->netdev_ops->ndo_uninit(dev);
4456 	goto out;
4457 }
4458 
4459 /**
4460  *	init_dummy_netdev	- init a dummy network device for NAPI
4461  *	@dev: device to init
4462  *
4463  *	This takes a network device structure and initialize the minimum
4464  *	amount of fields so it can be used to schedule NAPI polls without
4465  *	registering a full blown interface. This is to be used by drivers
4466  *	that need to tie several hardware interfaces to a single NAPI
4467  *	poll scheduler due to HW limitations.
4468  */
4469 int init_dummy_netdev(struct net_device *dev)
4470 {
4471 	/* Clear everything. Note we don't initialize spinlocks
4472 	 * are they aren't supposed to be taken by any of the
4473 	 * NAPI code and this dummy netdev is supposed to be
4474 	 * only ever used for NAPI polls
4475 	 */
4476 	memset(dev, 0, sizeof(struct net_device));
4477 
4478 	/* make sure we BUG if trying to hit standard
4479 	 * register/unregister code path
4480 	 */
4481 	dev->reg_state = NETREG_DUMMY;
4482 
4483 	/* initialize the ref count */
4484 	atomic_set(&dev->refcnt, 1);
4485 
4486 	/* NAPI wants this */
4487 	INIT_LIST_HEAD(&dev->napi_list);
4488 
4489 	/* a dummy interface is started by default */
4490 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4491 	set_bit(__LINK_STATE_START, &dev->state);
4492 
4493 	return 0;
4494 }
4495 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4496 
4497 
4498 /**
4499  *	register_netdev	- register a network device
4500  *	@dev: device to register
4501  *
4502  *	Take a completed network device structure and add it to the kernel
4503  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4504  *	chain. 0 is returned on success. A negative errno code is returned
4505  *	on a failure to set up the device, or if the name is a duplicate.
4506  *
4507  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4508  *	and expands the device name if you passed a format string to
4509  *	alloc_netdev.
4510  */
4511 int register_netdev(struct net_device *dev)
4512 {
4513 	int err;
4514 
4515 	rtnl_lock();
4516 
4517 	/*
4518 	 * If the name is a format string the caller wants us to do a
4519 	 * name allocation.
4520 	 */
4521 	if (strchr(dev->name, '%')) {
4522 		err = dev_alloc_name(dev, dev->name);
4523 		if (err < 0)
4524 			goto out;
4525 	}
4526 
4527 	err = register_netdevice(dev);
4528 out:
4529 	rtnl_unlock();
4530 	return err;
4531 }
4532 EXPORT_SYMBOL(register_netdev);
4533 
4534 /*
4535  * netdev_wait_allrefs - wait until all references are gone.
4536  *
4537  * This is called when unregistering network devices.
4538  *
4539  * Any protocol or device that holds a reference should register
4540  * for netdevice notification, and cleanup and put back the
4541  * reference if they receive an UNREGISTER event.
4542  * We can get stuck here if buggy protocols don't correctly
4543  * call dev_put.
4544  */
4545 static void netdev_wait_allrefs(struct net_device *dev)
4546 {
4547 	unsigned long rebroadcast_time, warning_time;
4548 
4549 	rebroadcast_time = warning_time = jiffies;
4550 	while (atomic_read(&dev->refcnt) != 0) {
4551 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4552 			rtnl_lock();
4553 
4554 			/* Rebroadcast unregister notification */
4555 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4556 
4557 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4558 				     &dev->state)) {
4559 				/* We must not have linkwatch events
4560 				 * pending on unregister. If this
4561 				 * happens, we simply run the queue
4562 				 * unscheduled, resulting in a noop
4563 				 * for this device.
4564 				 */
4565 				linkwatch_run_queue();
4566 			}
4567 
4568 			__rtnl_unlock();
4569 
4570 			rebroadcast_time = jiffies;
4571 		}
4572 
4573 		msleep(250);
4574 
4575 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4576 			printk(KERN_EMERG "unregister_netdevice: "
4577 			       "waiting for %s to become free. Usage "
4578 			       "count = %d\n",
4579 			       dev->name, atomic_read(&dev->refcnt));
4580 			warning_time = jiffies;
4581 		}
4582 	}
4583 }
4584 
4585 /* The sequence is:
4586  *
4587  *	rtnl_lock();
4588  *	...
4589  *	register_netdevice(x1);
4590  *	register_netdevice(x2);
4591  *	...
4592  *	unregister_netdevice(y1);
4593  *	unregister_netdevice(y2);
4594  *      ...
4595  *	rtnl_unlock();
4596  *	free_netdev(y1);
4597  *	free_netdev(y2);
4598  *
4599  * We are invoked by rtnl_unlock().
4600  * This allows us to deal with problems:
4601  * 1) We can delete sysfs objects which invoke hotplug
4602  *    without deadlocking with linkwatch via keventd.
4603  * 2) Since we run with the RTNL semaphore not held, we can sleep
4604  *    safely in order to wait for the netdev refcnt to drop to zero.
4605  *
4606  * We must not return until all unregister events added during
4607  * the interval the lock was held have been completed.
4608  */
4609 void netdev_run_todo(void)
4610 {
4611 	struct list_head list;
4612 
4613 	/* Snapshot list, allow later requests */
4614 	list_replace_init(&net_todo_list, &list);
4615 
4616 	__rtnl_unlock();
4617 
4618 	while (!list_empty(&list)) {
4619 		struct net_device *dev
4620 			= list_entry(list.next, struct net_device, todo_list);
4621 		list_del(&dev->todo_list);
4622 
4623 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4624 			printk(KERN_ERR "network todo '%s' but state %d\n",
4625 			       dev->name, dev->reg_state);
4626 			dump_stack();
4627 			continue;
4628 		}
4629 
4630 		dev->reg_state = NETREG_UNREGISTERED;
4631 
4632 		on_each_cpu(flush_backlog, dev, 1);
4633 
4634 		netdev_wait_allrefs(dev);
4635 
4636 		/* paranoia */
4637 		BUG_ON(atomic_read(&dev->refcnt));
4638 		WARN_ON(dev->ip_ptr);
4639 		WARN_ON(dev->ip6_ptr);
4640 		WARN_ON(dev->dn_ptr);
4641 
4642 		if (dev->destructor)
4643 			dev->destructor(dev);
4644 
4645 		/* Free network device */
4646 		kobject_put(&dev->dev.kobj);
4647 	}
4648 }
4649 
4650 /**
4651  *	dev_get_stats	- get network device statistics
4652  *	@dev: device to get statistics from
4653  *
4654  *	Get network statistics from device. The device driver may provide
4655  *	its own method by setting dev->netdev_ops->get_stats; otherwise
4656  *	the internal statistics structure is used.
4657  */
4658 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4659  {
4660 	const struct net_device_ops *ops = dev->netdev_ops;
4661 
4662 	if (ops->ndo_get_stats)
4663 		return ops->ndo_get_stats(dev);
4664 	else
4665 		return &dev->stats;
4666 }
4667 EXPORT_SYMBOL(dev_get_stats);
4668 
4669 static void netdev_init_one_queue(struct net_device *dev,
4670 				  struct netdev_queue *queue,
4671 				  void *_unused)
4672 {
4673 	queue->dev = dev;
4674 }
4675 
4676 static void netdev_init_queues(struct net_device *dev)
4677 {
4678 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4679 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4680 	spin_lock_init(&dev->tx_global_lock);
4681 }
4682 
4683 /**
4684  *	alloc_netdev_mq - allocate network device
4685  *	@sizeof_priv:	size of private data to allocate space for
4686  *	@name:		device name format string
4687  *	@setup:		callback to initialize device
4688  *	@queue_count:	the number of subqueues to allocate
4689  *
4690  *	Allocates a struct net_device with private data area for driver use
4691  *	and performs basic initialization.  Also allocates subquue structs
4692  *	for each queue on the device at the end of the netdevice.
4693  */
4694 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4695 		void (*setup)(struct net_device *), unsigned int queue_count)
4696 {
4697 	struct netdev_queue *tx;
4698 	struct net_device *dev;
4699 	size_t alloc_size;
4700 	void *p;
4701 
4702 	BUG_ON(strlen(name) >= sizeof(dev->name));
4703 
4704 	alloc_size = sizeof(struct net_device);
4705 	if (sizeof_priv) {
4706 		/* ensure 32-byte alignment of private area */
4707 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4708 		alloc_size += sizeof_priv;
4709 	}
4710 	/* ensure 32-byte alignment of whole construct */
4711 	alloc_size += NETDEV_ALIGN_CONST;
4712 
4713 	p = kzalloc(alloc_size, GFP_KERNEL);
4714 	if (!p) {
4715 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4716 		return NULL;
4717 	}
4718 
4719 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4720 	if (!tx) {
4721 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4722 		       "tx qdiscs.\n");
4723 		kfree(p);
4724 		return NULL;
4725 	}
4726 
4727 	dev = (struct net_device *)
4728 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4729 	dev->padded = (char *)dev - (char *)p;
4730 	dev_net_set(dev, &init_net);
4731 
4732 	dev->_tx = tx;
4733 	dev->num_tx_queues = queue_count;
4734 	dev->real_num_tx_queues = queue_count;
4735 
4736 	dev->gso_max_size = GSO_MAX_SIZE;
4737 
4738 	netdev_init_queues(dev);
4739 
4740 	INIT_LIST_HEAD(&dev->napi_list);
4741 	setup(dev);
4742 	strcpy(dev->name, name);
4743 	return dev;
4744 }
4745 EXPORT_SYMBOL(alloc_netdev_mq);
4746 
4747 /**
4748  *	free_netdev - free network device
4749  *	@dev: device
4750  *
4751  *	This function does the last stage of destroying an allocated device
4752  * 	interface. The reference to the device object is released.
4753  *	If this is the last reference then it will be freed.
4754  */
4755 void free_netdev(struct net_device *dev)
4756 {
4757 	struct napi_struct *p, *n;
4758 
4759 	release_net(dev_net(dev));
4760 
4761 	kfree(dev->_tx);
4762 
4763 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4764 		netif_napi_del(p);
4765 
4766 	/*  Compatibility with error handling in drivers */
4767 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4768 		kfree((char *)dev - dev->padded);
4769 		return;
4770 	}
4771 
4772 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4773 	dev->reg_state = NETREG_RELEASED;
4774 
4775 	/* will free via device release */
4776 	put_device(&dev->dev);
4777 }
4778 
4779 /**
4780  *	synchronize_net -  Synchronize with packet receive processing
4781  *
4782  *	Wait for packets currently being received to be done.
4783  *	Does not block later packets from starting.
4784  */
4785 void synchronize_net(void)
4786 {
4787 	might_sleep();
4788 	synchronize_rcu();
4789 }
4790 
4791 /**
4792  *	unregister_netdevice - remove device from the kernel
4793  *	@dev: device
4794  *
4795  *	This function shuts down a device interface and removes it
4796  *	from the kernel tables.
4797  *
4798  *	Callers must hold the rtnl semaphore.  You may want
4799  *	unregister_netdev() instead of this.
4800  */
4801 
4802 void unregister_netdevice(struct net_device *dev)
4803 {
4804 	ASSERT_RTNL();
4805 
4806 	rollback_registered(dev);
4807 	/* Finish processing unregister after unlock */
4808 	net_set_todo(dev);
4809 }
4810 
4811 /**
4812  *	unregister_netdev - remove device from the kernel
4813  *	@dev: device
4814  *
4815  *	This function shuts down a device interface and removes it
4816  *	from the kernel tables.
4817  *
4818  *	This is just a wrapper for unregister_netdevice that takes
4819  *	the rtnl semaphore.  In general you want to use this and not
4820  *	unregister_netdevice.
4821  */
4822 void unregister_netdev(struct net_device *dev)
4823 {
4824 	rtnl_lock();
4825 	unregister_netdevice(dev);
4826 	rtnl_unlock();
4827 }
4828 
4829 EXPORT_SYMBOL(unregister_netdev);
4830 
4831 /**
4832  *	dev_change_net_namespace - move device to different nethost namespace
4833  *	@dev: device
4834  *	@net: network namespace
4835  *	@pat: If not NULL name pattern to try if the current device name
4836  *	      is already taken in the destination network namespace.
4837  *
4838  *	This function shuts down a device interface and moves it
4839  *	to a new network namespace. On success 0 is returned, on
4840  *	a failure a netagive errno code is returned.
4841  *
4842  *	Callers must hold the rtnl semaphore.
4843  */
4844 
4845 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4846 {
4847 	char buf[IFNAMSIZ];
4848 	const char *destname;
4849 	int err;
4850 
4851 	ASSERT_RTNL();
4852 
4853 	/* Don't allow namespace local devices to be moved. */
4854 	err = -EINVAL;
4855 	if (dev->features & NETIF_F_NETNS_LOCAL)
4856 		goto out;
4857 
4858 #ifdef CONFIG_SYSFS
4859 	/* Don't allow real devices to be moved when sysfs
4860 	 * is enabled.
4861 	 */
4862 	err = -EINVAL;
4863 	if (dev->dev.parent)
4864 		goto out;
4865 #endif
4866 
4867 	/* Ensure the device has been registrered */
4868 	err = -EINVAL;
4869 	if (dev->reg_state != NETREG_REGISTERED)
4870 		goto out;
4871 
4872 	/* Get out if there is nothing todo */
4873 	err = 0;
4874 	if (net_eq(dev_net(dev), net))
4875 		goto out;
4876 
4877 	/* Pick the destination device name, and ensure
4878 	 * we can use it in the destination network namespace.
4879 	 */
4880 	err = -EEXIST;
4881 	destname = dev->name;
4882 	if (__dev_get_by_name(net, destname)) {
4883 		/* We get here if we can't use the current device name */
4884 		if (!pat)
4885 			goto out;
4886 		if (!dev_valid_name(pat))
4887 			goto out;
4888 		if (strchr(pat, '%')) {
4889 			if (__dev_alloc_name(net, pat, buf) < 0)
4890 				goto out;
4891 			destname = buf;
4892 		} else
4893 			destname = pat;
4894 		if (__dev_get_by_name(net, destname))
4895 			goto out;
4896 	}
4897 
4898 	/*
4899 	 * And now a mini version of register_netdevice unregister_netdevice.
4900 	 */
4901 
4902 	/* If device is running close it first. */
4903 	dev_close(dev);
4904 
4905 	/* And unlink it from device chain */
4906 	err = -ENODEV;
4907 	unlist_netdevice(dev);
4908 
4909 	synchronize_net();
4910 
4911 	/* Shutdown queueing discipline. */
4912 	dev_shutdown(dev);
4913 
4914 	/* Notify protocols, that we are about to destroy
4915 	   this device. They should clean all the things.
4916 	*/
4917 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4918 
4919 	/*
4920 	 *	Flush the unicast and multicast chains
4921 	 */
4922 	dev_addr_discard(dev);
4923 
4924 	netdev_unregister_kobject(dev);
4925 
4926 	/* Actually switch the network namespace */
4927 	dev_net_set(dev, net);
4928 
4929 	/* Assign the new device name */
4930 	if (destname != dev->name)
4931 		strcpy(dev->name, destname);
4932 
4933 	/* If there is an ifindex conflict assign a new one */
4934 	if (__dev_get_by_index(net, dev->ifindex)) {
4935 		int iflink = (dev->iflink == dev->ifindex);
4936 		dev->ifindex = dev_new_index(net);
4937 		if (iflink)
4938 			dev->iflink = dev->ifindex;
4939 	}
4940 
4941 	/* Fixup kobjects */
4942 	err = netdev_register_kobject(dev);
4943 	WARN_ON(err);
4944 
4945 	/* Add the device back in the hashes */
4946 	list_netdevice(dev);
4947 
4948 	/* Notify protocols, that a new device appeared. */
4949 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4950 
4951 	synchronize_net();
4952 	err = 0;
4953 out:
4954 	return err;
4955 }
4956 
4957 static int dev_cpu_callback(struct notifier_block *nfb,
4958 			    unsigned long action,
4959 			    void *ocpu)
4960 {
4961 	struct sk_buff **list_skb;
4962 	struct Qdisc **list_net;
4963 	struct sk_buff *skb;
4964 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4965 	struct softnet_data *sd, *oldsd;
4966 
4967 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4968 		return NOTIFY_OK;
4969 
4970 	local_irq_disable();
4971 	cpu = smp_processor_id();
4972 	sd = &per_cpu(softnet_data, cpu);
4973 	oldsd = &per_cpu(softnet_data, oldcpu);
4974 
4975 	/* Find end of our completion_queue. */
4976 	list_skb = &sd->completion_queue;
4977 	while (*list_skb)
4978 		list_skb = &(*list_skb)->next;
4979 	/* Append completion queue from offline CPU. */
4980 	*list_skb = oldsd->completion_queue;
4981 	oldsd->completion_queue = NULL;
4982 
4983 	/* Find end of our output_queue. */
4984 	list_net = &sd->output_queue;
4985 	while (*list_net)
4986 		list_net = &(*list_net)->next_sched;
4987 	/* Append output queue from offline CPU. */
4988 	*list_net = oldsd->output_queue;
4989 	oldsd->output_queue = NULL;
4990 
4991 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4992 	local_irq_enable();
4993 
4994 	/* Process offline CPU's input_pkt_queue */
4995 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4996 		netif_rx(skb);
4997 
4998 	return NOTIFY_OK;
4999 }
5000 
5001 
5002 /**
5003  *	netdev_increment_features - increment feature set by one
5004  *	@all: current feature set
5005  *	@one: new feature set
5006  *	@mask: mask feature set
5007  *
5008  *	Computes a new feature set after adding a device with feature set
5009  *	@one to the master device with current feature set @all.  Will not
5010  *	enable anything that is off in @mask. Returns the new feature set.
5011  */
5012 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5013 					unsigned long mask)
5014 {
5015 	/* If device needs checksumming, downgrade to it. */
5016         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5017 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5018 	else if (mask & NETIF_F_ALL_CSUM) {
5019 		/* If one device supports v4/v6 checksumming, set for all. */
5020 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5021 		    !(all & NETIF_F_GEN_CSUM)) {
5022 			all &= ~NETIF_F_ALL_CSUM;
5023 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5024 		}
5025 
5026 		/* If one device supports hw checksumming, set for all. */
5027 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5028 			all &= ~NETIF_F_ALL_CSUM;
5029 			all |= NETIF_F_HW_CSUM;
5030 		}
5031 	}
5032 
5033 	one |= NETIF_F_ALL_CSUM;
5034 
5035 	one |= all & NETIF_F_ONE_FOR_ALL;
5036 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5037 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5038 
5039 	return all;
5040 }
5041 EXPORT_SYMBOL(netdev_increment_features);
5042 
5043 static struct hlist_head *netdev_create_hash(void)
5044 {
5045 	int i;
5046 	struct hlist_head *hash;
5047 
5048 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5049 	if (hash != NULL)
5050 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5051 			INIT_HLIST_HEAD(&hash[i]);
5052 
5053 	return hash;
5054 }
5055 
5056 /* Initialize per network namespace state */
5057 static int __net_init netdev_init(struct net *net)
5058 {
5059 	INIT_LIST_HEAD(&net->dev_base_head);
5060 
5061 	net->dev_name_head = netdev_create_hash();
5062 	if (net->dev_name_head == NULL)
5063 		goto err_name;
5064 
5065 	net->dev_index_head = netdev_create_hash();
5066 	if (net->dev_index_head == NULL)
5067 		goto err_idx;
5068 
5069 	return 0;
5070 
5071 err_idx:
5072 	kfree(net->dev_name_head);
5073 err_name:
5074 	return -ENOMEM;
5075 }
5076 
5077 /**
5078  *	netdev_drivername - network driver for the device
5079  *	@dev: network device
5080  *	@buffer: buffer for resulting name
5081  *	@len: size of buffer
5082  *
5083  *	Determine network driver for device.
5084  */
5085 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5086 {
5087 	const struct device_driver *driver;
5088 	const struct device *parent;
5089 
5090 	if (len <= 0 || !buffer)
5091 		return buffer;
5092 	buffer[0] = 0;
5093 
5094 	parent = dev->dev.parent;
5095 
5096 	if (!parent)
5097 		return buffer;
5098 
5099 	driver = parent->driver;
5100 	if (driver && driver->name)
5101 		strlcpy(buffer, driver->name, len);
5102 	return buffer;
5103 }
5104 
5105 static void __net_exit netdev_exit(struct net *net)
5106 {
5107 	kfree(net->dev_name_head);
5108 	kfree(net->dev_index_head);
5109 }
5110 
5111 static struct pernet_operations __net_initdata netdev_net_ops = {
5112 	.init = netdev_init,
5113 	.exit = netdev_exit,
5114 };
5115 
5116 static void __net_exit default_device_exit(struct net *net)
5117 {
5118 	struct net_device *dev;
5119 	/*
5120 	 * Push all migratable of the network devices back to the
5121 	 * initial network namespace
5122 	 */
5123 	rtnl_lock();
5124 restart:
5125 	for_each_netdev(net, dev) {
5126 		int err;
5127 		char fb_name[IFNAMSIZ];
5128 
5129 		/* Ignore unmoveable devices (i.e. loopback) */
5130 		if (dev->features & NETIF_F_NETNS_LOCAL)
5131 			continue;
5132 
5133 		/* Delete virtual devices */
5134 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5135 			dev->rtnl_link_ops->dellink(dev);
5136 			goto restart;
5137 		}
5138 
5139 		/* Push remaing network devices to init_net */
5140 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5141 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5142 		if (err) {
5143 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5144 				__func__, dev->name, err);
5145 			BUG();
5146 		}
5147 		goto restart;
5148 	}
5149 	rtnl_unlock();
5150 }
5151 
5152 static struct pernet_operations __net_initdata default_device_ops = {
5153 	.exit = default_device_exit,
5154 };
5155 
5156 /*
5157  *	Initialize the DEV module. At boot time this walks the device list and
5158  *	unhooks any devices that fail to initialise (normally hardware not
5159  *	present) and leaves us with a valid list of present and active devices.
5160  *
5161  */
5162 
5163 /*
5164  *       This is called single threaded during boot, so no need
5165  *       to take the rtnl semaphore.
5166  */
5167 static int __init net_dev_init(void)
5168 {
5169 	int i, rc = -ENOMEM;
5170 
5171 	BUG_ON(!dev_boot_phase);
5172 
5173 	if (dev_proc_init())
5174 		goto out;
5175 
5176 	if (netdev_kobject_init())
5177 		goto out;
5178 
5179 	INIT_LIST_HEAD(&ptype_all);
5180 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5181 		INIT_LIST_HEAD(&ptype_base[i]);
5182 
5183 	if (register_pernet_subsys(&netdev_net_ops))
5184 		goto out;
5185 
5186 	/*
5187 	 *	Initialise the packet receive queues.
5188 	 */
5189 
5190 	for_each_possible_cpu(i) {
5191 		struct softnet_data *queue;
5192 
5193 		queue = &per_cpu(softnet_data, i);
5194 		skb_queue_head_init(&queue->input_pkt_queue);
5195 		queue->completion_queue = NULL;
5196 		INIT_LIST_HEAD(&queue->poll_list);
5197 
5198 		queue->backlog.poll = process_backlog;
5199 		queue->backlog.weight = weight_p;
5200 		queue->backlog.gro_list = NULL;
5201 	}
5202 
5203 	dev_boot_phase = 0;
5204 
5205 	/* The loopback device is special if any other network devices
5206 	 * is present in a network namespace the loopback device must
5207 	 * be present. Since we now dynamically allocate and free the
5208 	 * loopback device ensure this invariant is maintained by
5209 	 * keeping the loopback device as the first device on the
5210 	 * list of network devices.  Ensuring the loopback devices
5211 	 * is the first device that appears and the last network device
5212 	 * that disappears.
5213 	 */
5214 	if (register_pernet_device(&loopback_net_ops))
5215 		goto out;
5216 
5217 	if (register_pernet_device(&default_device_ops))
5218 		goto out;
5219 
5220 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5221 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5222 
5223 	hotcpu_notifier(dev_cpu_callback, 0);
5224 	dst_init();
5225 	dev_mcast_init();
5226 	rc = 0;
5227 out:
5228 	return rc;
5229 }
5230 
5231 subsys_initcall(net_dev_init);
5232 
5233 EXPORT_SYMBOL(__dev_get_by_index);
5234 EXPORT_SYMBOL(__dev_get_by_name);
5235 EXPORT_SYMBOL(__dev_remove_pack);
5236 EXPORT_SYMBOL(dev_valid_name);
5237 EXPORT_SYMBOL(dev_add_pack);
5238 EXPORT_SYMBOL(dev_alloc_name);
5239 EXPORT_SYMBOL(dev_close);
5240 EXPORT_SYMBOL(dev_get_by_flags);
5241 EXPORT_SYMBOL(dev_get_by_index);
5242 EXPORT_SYMBOL(dev_get_by_name);
5243 EXPORT_SYMBOL(dev_open);
5244 EXPORT_SYMBOL(dev_queue_xmit);
5245 EXPORT_SYMBOL(dev_remove_pack);
5246 EXPORT_SYMBOL(dev_set_allmulti);
5247 EXPORT_SYMBOL(dev_set_promiscuity);
5248 EXPORT_SYMBOL(dev_change_flags);
5249 EXPORT_SYMBOL(dev_set_mtu);
5250 EXPORT_SYMBOL(dev_set_mac_address);
5251 EXPORT_SYMBOL(free_netdev);
5252 EXPORT_SYMBOL(netdev_boot_setup_check);
5253 EXPORT_SYMBOL(netdev_set_master);
5254 EXPORT_SYMBOL(netdev_state_change);
5255 EXPORT_SYMBOL(netif_receive_skb);
5256 EXPORT_SYMBOL(netif_rx);
5257 EXPORT_SYMBOL(register_gifconf);
5258 EXPORT_SYMBOL(register_netdevice);
5259 EXPORT_SYMBOL(register_netdevice_notifier);
5260 EXPORT_SYMBOL(skb_checksum_help);
5261 EXPORT_SYMBOL(synchronize_net);
5262 EXPORT_SYMBOL(unregister_netdevice);
5263 EXPORT_SYMBOL(unregister_netdevice_notifier);
5264 EXPORT_SYMBOL(net_enable_timestamp);
5265 EXPORT_SYMBOL(net_disable_timestamp);
5266 EXPORT_SYMBOL(dev_get_flags);
5267 
5268 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5269 EXPORT_SYMBOL(br_handle_frame_hook);
5270 EXPORT_SYMBOL(br_fdb_get_hook);
5271 EXPORT_SYMBOL(br_fdb_put_hook);
5272 #endif
5273 
5274 EXPORT_SYMBOL(dev_load);
5275 
5276 EXPORT_PER_CPU_SYMBOL(softnet_data);
5277