xref: /linux/net/core/dev.c (revision 0c93ea4064a209cdc36de8a9a3003d43d08f46f7)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16? Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
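/*
 * For illustration (a worked example, not in the original source):
 * dev_add_pack() below hashes on ntohs(pt->type) & PTYPE_HASH_MASK, so
 * X.25 (0x0805), RARP (0x8035) and the SNAP value 0x0005 all share
 * bucket 5 - the collision mentioned above - while ARP (0x0806) sits
 * alone in bucket 6 and IP (0x0800) in bucket 0.
 */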
168 
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * For example usages, see register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
195 
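/*
 * A minimal reader sketch for the locking rules documented above (assumed
 * usage, not code from this file): pure readers take dev_base_lock for
 * reading and must not sleep while holding it.
 */
#if 0
static void example_dump_devices(struct net *net)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		printk(KERN_DEBUG "dev %s has ifindex %d\n",
		       dev->name, dev->ifindex);
	read_unlock(&dev_base_lock);
}
#endif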
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
212 {
213 	struct net *net = dev_net(dev);
214 
215 	ASSERT_RTNL();
216 
217 	write_lock_bh(&dev_base_lock);
218 	list_add_tail(&dev->dev_list, &net->dev_base_head);
219 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
220 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 	ASSERT_RTNL();
229 
230 	/* Unlink dev from the device chain */
231 	write_lock_bh(&dev_base_lock);
232 	list_del(&dev->dev_list);
233 	hlist_del(&dev->name_hlist);
234 	hlist_del(&dev->index_hlist);
235 	write_unlock_bh(&dev_base_lock);
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 }
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 }
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers that mangle input packets
345  *	MUST BE last in the hash buckets, and checking protocol handlers
346  *	MUST start from the promiscuous ptype_all chain in net_bh.
347  *	This is true now; do not change it.
348  *	Explanation follows: if a packet-mangling protocol handler is
349  *	first on the list, it cannot tell that the packet
350  *	is cloned and should be copied-on-write, so it will
351  *	change it and subsequent readers will get a broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep, therefore it cannot
364  *	guarantee that all CPUs in the middle of receiving packets
365  *	will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
381 
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPUs have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432 	__dev_remove_pack(pt);
433 
434 	synchronize_net();
435 }
436 
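/*
 * A hedged usage sketch for dev_add_pack()/dev_remove_pack() (not part of
 * this file; example_rcv and example_pt are illustrative names): a module
 * tapping every IPv4 frame registers a &packet_type at init time and
 * removes it on exit, relying on dev_remove_pack() to wait out any CPUs
 * still running the handler.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns the skb and must free it when finished. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_pt __read_mostly = {
	.type = __constant_htons(ETH_P_IP),	/* hashed into ptype_base[] */
	.func = example_rcv,			/* .dev == NULL: all devices */
};

static int __init example_init(void)
{
	dev_add_pack(&example_pt);
	return 0;
}

static void __exit example_exit(void)
{
	dev_remove_pack(&example_pt);		/* sleeps in synchronize_net() */
}
#endif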
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds a new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine
453  *	for all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot time settings for the device.
478  *	Any settings found are applied to the device for use
479  *	later during device probing.
480  *	Returns 0 if no settings were found, 1 if they were.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot time settings for the base address of the device.
507  *	Any settings found are applied to the device for use
508  *	later during device probing.
509  *	Returns 0 if no settings are found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If device already registered then return base of 1
521 	 * to indicate not to probe for this interface
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves at boot time configured settings for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
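/*
 * For illustration (assumed typical usage, not taken from this file): booting
 * with "netdev=9,0x300,0,0,eth1" makes netdev_boot_setup() above store
 * irq = 9 and base_addr = 0x300 for "eth1", which netdev_boot_setup_check()
 * later copies into the device when an "eth1" probe runs.
 */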
560 
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
615 
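/*
 * A minimal sketch of the hold/put pattern required by dev_get_by_name()
 * (assumed usage, not code from this file; "eth0" is a placeholder name).
 */
#if 0
static void example_lookup(void)
{
	struct net_device *dev;

	dev = dev_get_by_name(&init_net, "eth0");
	if (!dev)
		return;
	/* ... use dev; the reference keeps it from being freed ... */
	dev_put(dev);
}
#endif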
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns a pointer to the device,
622  *	or %NULL if the device is not found. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns a pointer to the device,
648  *	or NULL if the device is not found. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns a pointer to the
672  *	device, or NULL if it is not found. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count increased
674  *	and the caller must therefore be careful about locking.
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *	Search for any interface with the given flags. Returns a pointer to
730  *	the device, or NULL if no matching device is found. The device returned has
731  *	is not found or a pointer to the device. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
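/*
 * For illustration (values assumed, not from this file): dev_valid_name()
 * accepts "eth0" or "wlan%d", but rejects "", ".", "..", anything containing
 * '/' or whitespace, and any name of IFNAMSIZ characters or more.
 */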
777 
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" - it will try to find a suitable
785  *	id. It scans list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
794 {
795 	int i = 0;
796 	const char *p;
797 	const int max_netdevices = 8*PAGE_SIZE;
798 	unsigned long *inuse;
799 	struct net_device *d;
800 
801 	p = strnchr(name, IFNAMSIZ-1, '%');
802 	if (p) {
803 		/*
804 		 * Verify the string as this thing may have come from
805 		 * the user.  There must be exactly one "%d" and no other "%"
806 		 * characters.
807 		 */
808 		if (p[1] != 'd' || strchr(p + 2, '%'))
809 			return -EINVAL;
810 
811 		/* Use one page as a bit array of possible slots */
812 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
813 		if (!inuse)
814 			return -ENOMEM;
815 
816 		for_each_netdev(net, d) {
817 			if (!sscanf(d->name, name, &i))
818 				continue;
819 			if (i < 0 || i >= max_netdevices)
820 				continue;
821 
822 			/*  avoid cases where sscanf is not exact inverse of printf */
823 			snprintf(buf, IFNAMSIZ, name, i);
824 			if (!strncmp(buf, d->name, IFNAMSIZ))
825 				set_bit(i, inuse);
826 		}
827 
828 		i = find_first_zero_bit(inuse, max_netdevices);
829 		free_page((unsigned long) inuse);
830 	}
831 
832 	snprintf(buf, IFNAMSIZ, name, i);
833 	if (!__dev_get_by_name(net, buf))
834 		return i;
835 
836 	/* It is possible to run out of slots
837 	 * when the name is long and there isn't enough space left
838 	 * for the digits, or if all bits are used.
839 	 */
840 	return -ENFILE;
841 }
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" - it will try to find a suitable
849  *	id. It scans list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
870 
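/*
 * A hedged sketch of how a driver typically uses dev_alloc_name() before
 * registration (not part of this file; "dummy%d" is a placeholder format).
 */
#if 0
static int example_assign_name(struct net_device *dev)
{
	int err;

	err = dev_alloc_name(dev, "dummy%d");	/* picks the lowest free unit */
	if (err < 0)
		return err;
	return register_netdevice(dev);		/* caller holds the RTNL lock */
}
#endif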
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *
877  *	Change the name of a device; a format string such as "eth%d"
878  *	can be passed for wildcarding.
879  */
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	} else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from info
952  *
953  *	Set the ifalias for a device.
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
985 void netdev_features_change(struct net_device *dev)
986 {
987 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
988 }
989 EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Enable NET_DMA
1092 		 */
1093 		net_dmaengine_get();
1094 
1095 		/*
1096 		 *	Initialize multicasting status
1097 		 */
1098 		dev_set_rx_mode(dev);
1099 
1100 		/*
1101 		 *	Wakeup transmit queue engine
1102 		 */
1103 		dev_activate(dev);
1104 
1105 		/*
1106 		 *	... and announce new interface.
1107 		 */
1108 		call_netdevice_notifiers(NETDEV_UP, dev);
1109 	}
1110 
1111 	return ret;
1112 }
1113 
1114 /**
1115  *	dev_close - shutdown an interface.
1116  *	@dev: device to shutdown
1117  *
1118  *	This function moves an active device into down state. A
1119  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1120  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1121  *	chain.
1122  */
1123 int dev_close(struct net_device *dev)
1124 {
1125 	const struct net_device_ops *ops = dev->netdev_ops;
1126 	ASSERT_RTNL();
1127 
1128 	might_sleep();
1129 
1130 	if (!(dev->flags & IFF_UP))
1131 		return 0;
1132 
1133 	/*
1134 	 *	Tell people we are going down, so that they can
1135 	 *	prepare for it while the device is still operating.
1136 	 */
1137 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1138 
1139 	clear_bit(__LINK_STATE_START, &dev->state);
1140 
1141 	/* Synchronize to the scheduled poll. We cannot touch the poll list;
1142 	 * it may even be running on a different cpu. So just clear netif_running().
1143 	 *
1144 	 * dev->stop() will invoke napi_disable() on all of its
1145 	 * napi_struct instances on this device.
1146 	 */
1147 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1148 
1149 	dev_deactivate(dev);
1150 
1151 	/*
1152 	 *	Call the device specific close. This cannot fail and is
1153 	 *	only done if the device is UP.
1154 	 *
1155 	 *	We allow it to be called even after a DETACH hot-plug
1156 	 *	event.
1157 	 */
1158 	if (ops->ndo_stop)
1159 		ops->ndo_stop(dev);
1160 
1161 	/*
1162 	 *	Device is now down.
1163 	 */
1164 
1165 	dev->flags &= ~IFF_UP;
1166 
1167 	/*
1168 	 * Tell people we are down
1169 	 */
1170 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1171 
1172 	/*
1173 	 *	Shutdown NET_DMA
1174 	 */
1175 	net_dmaengine_put();
1176 
1177 	return 0;
1178 }
1179 
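/*
 * A hedged sketch of bringing an interface up and down from process context
 * (assumed usage, not code from this file).  Both calls require the RTNL
 * lock, as asserted in dev_open()/dev_close() above.
 */
#if 0
static int example_cycle(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* nop if the device is already up */
	if (!err)
		err = dev_close(dev);	/* sends NETDEV_GOING_DOWN/NETDEV_DOWN */
	rtnl_unlock();
	return err;
}
#endif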
1180 
1181 /**
1182  *	dev_disable_lro - disable Large Receive Offload on a device
1183  *	@dev: device
1184  *
1185  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186  *	called under RTNL.  This is needed if received packets may be
1187  *	forwarded to another interface.
1188  */
1189 void dev_disable_lro(struct net_device *dev)
1190 {
1191 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192 	    dev->ethtool_ops->set_flags) {
1193 		u32 flags = dev->ethtool_ops->get_flags(dev);
1194 		if (flags & ETH_FLAG_LRO) {
1195 			flags &= ~ETH_FLAG_LRO;
1196 			dev->ethtool_ops->set_flags(dev, flags);
1197 		}
1198 	}
1199 	WARN_ON(dev->features & NETIF_F_LRO);
1200 }
1201 EXPORT_SYMBOL(dev_disable_lro);
1202 
1203 
1204 static int dev_boot_phase = 1;
1205 
1206 /*
1207  *	Device change register/unregister. These are not inline or static
1208  *	as we export them to the world.
1209  */
1210 
1211 /**
1212  *	register_netdevice_notifier - register a network notifier block
1213  *	@nb: notifier
1214  *
1215  *	Register a notifier to be called when network device events occur.
1216  *	The notifier passed is linked into the kernel structures and must
1217  *	not be reused until it has been unregistered. A negative errno code
1218  *	is returned on a failure.
1219  *
1220  * 	When registered, all registration and up events are replayed
1221  *	to the new notifier to give it a race-free
1222  *	view of the network device list.
1223  */
1224 
1225 int register_netdevice_notifier(struct notifier_block *nb)
1226 {
1227 	struct net_device *dev;
1228 	struct net_device *last;
1229 	struct net *net;
1230 	int err;
1231 
1232 	rtnl_lock();
1233 	err = raw_notifier_chain_register(&netdev_chain, nb);
1234 	if (err)
1235 		goto unlock;
1236 	if (dev_boot_phase)
1237 		goto unlock;
1238 	for_each_net(net) {
1239 		for_each_netdev(net, dev) {
1240 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241 			err = notifier_to_errno(err);
1242 			if (err)
1243 				goto rollback;
1244 
1245 			if (!(dev->flags & IFF_UP))
1246 				continue;
1247 
1248 			nb->notifier_call(nb, NETDEV_UP, dev);
1249 		}
1250 	}
1251 
1252 unlock:
1253 	rtnl_unlock();
1254 	return err;
1255 
1256 rollback:
1257 	last = dev;
1258 	for_each_net(net) {
1259 		for_each_netdev(net, dev) {
1260 			if (dev == last)
1261 				break;
1262 
1263 			if (dev->flags & IFF_UP) {
1264 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266 			}
1267 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268 		}
1269 	}
1270 
1271 	raw_notifier_chain_unregister(&netdev_chain, nb);
1272 	goto unlock;
1273 }
1274 
1275 /**
1276  *	unregister_netdevice_notifier - unregister a network notifier block
1277  *	@nb: notifier
1278  *
1279  *	Unregister a notifier previously registered by
1280  *	register_netdevice_notifier(). The notifier is unlinked from the
1281  *	kernel structures and may then be reused. A negative errno code
1282  *	is returned on a failure.
1283  */
1284 
1285 int unregister_netdevice_notifier(struct notifier_block *nb)
1286 {
1287 	int err;
1288 
1289 	rtnl_lock();
1290 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1291 	rtnl_unlock();
1292 	return err;
1293 }
1294 
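/*
 * A minimal notifier sketch for the API documented above (illustrative only;
 * example_event and example_nb are not real symbols).  In this kernel the
 * notifier data pointer is the struct net_device itself.
 */
#if 0
static int example_event(struct notifier_block *nb, unsigned long event,
			 void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_REGISTER)
		printk(KERN_DEBUG "%s: registered\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_event,
};

/* register_netdevice_notifier(&example_nb) replays NETDEV_REGISTER and
 * NETDEV_UP for devices that already exist; unregister_netdevice_notifier()
 * removes the block again. */
#endif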
1295 /**
1296  *	call_netdevice_notifiers - call all network notifier blocks
1297  *      @val: value passed unmodified to notifier function
1298  *      @dev: net_device pointer passed unmodified to notifier function
1299  *
1300  *	Call all network notifier blocks.  Parameters and return value
1301  *	are as for raw_notifier_call_chain().
1302  */
1303 
1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1305 {
1306 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1307 }
1308 
1309 /* When > 0 there are consumers of rx skb time stamps */
1310 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311 
1312 void net_enable_timestamp(void)
1313 {
1314 	atomic_inc(&netstamp_needed);
1315 }
1316 
1317 void net_disable_timestamp(void)
1318 {
1319 	atomic_dec(&netstamp_needed);
1320 }
1321 
1322 static inline void net_timestamp(struct sk_buff *skb)
1323 {
1324 	if (atomic_read(&netstamp_needed))
1325 		__net_timestamp(skb);
1326 	else
1327 		skb->tstamp.tv64 = 0;
1328 }
1329 
1330 /*
1331  *	Support routine. Sends outgoing frames to any network
1332  *	taps currently in use.
1333  */
1334 
1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336 {
1337 	struct packet_type *ptype;
1338 
1339 	net_timestamp(skb);
1340 
1341 	rcu_read_lock();
1342 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343 		/* Never send packets back to the socket
1344 		 * they originated from - MvS (miquels@drinkel.ow.org)
1345 		 */
1346 		if ((ptype->dev == dev || !ptype->dev) &&
1347 		    (ptype->af_packet_priv == NULL ||
1348 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1350 			if (!skb2)
1351 				break;
1352 
1353 			/* The network header should already be correctly
1354 			   set by the sender, so the check below is
1355 			   just protection against buggy protocols.
1356 			 */
1357 			skb_reset_mac_header(skb2);
1358 
1359 			if (skb_network_header(skb2) < skb2->data ||
1360 			    skb2->network_header > skb2->tail) {
1361 				if (net_ratelimit())
1362 					printk(KERN_CRIT "protocol %04x is "
1363 					       "buggy, dev %s\n",
1364 					       skb2->protocol, dev->name);
1365 				skb_reset_network_header(skb2);
1366 			}
1367 
1368 			skb2->transport_header = skb2->network_header;
1369 			skb2->pkt_type = PACKET_OUTGOING;
1370 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371 		}
1372 	}
1373 	rcu_read_unlock();
1374 }
1375 
1376 
1377 static inline void __netif_reschedule(struct Qdisc *q)
1378 {
1379 	struct softnet_data *sd;
1380 	unsigned long flags;
1381 
1382 	local_irq_save(flags);
1383 	sd = &__get_cpu_var(softnet_data);
1384 	q->next_sched = sd->output_queue;
1385 	sd->output_queue = q;
1386 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1387 	local_irq_restore(flags);
1388 }
1389 
1390 void __netif_schedule(struct Qdisc *q)
1391 {
1392 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1393 		__netif_reschedule(q);
1394 }
1395 EXPORT_SYMBOL(__netif_schedule);
1396 
1397 void dev_kfree_skb_irq(struct sk_buff *skb)
1398 {
1399 	if (atomic_dec_and_test(&skb->users)) {
1400 		struct softnet_data *sd;
1401 		unsigned long flags;
1402 
1403 		local_irq_save(flags);
1404 		sd = &__get_cpu_var(softnet_data);
1405 		skb->next = sd->completion_queue;
1406 		sd->completion_queue = skb;
1407 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1408 		local_irq_restore(flags);
1409 	}
1410 }
1411 EXPORT_SYMBOL(dev_kfree_skb_irq);
1412 
1413 void dev_kfree_skb_any(struct sk_buff *skb)
1414 {
1415 	if (in_irq() || irqs_disabled())
1416 		dev_kfree_skb_irq(skb);
1417 	else
1418 		dev_kfree_skb(skb);
1419 }
1420 EXPORT_SYMBOL(dev_kfree_skb_any);
1421 
1422 
1423 /**
1424  * netif_device_detach - mark device as removed
1425  * @dev: network device
1426  *
1427  * Mark device as removed from the system and therefore no longer available.
1428  */
1429 void netif_device_detach(struct net_device *dev)
1430 {
1431 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432 	    netif_running(dev)) {
1433 		netif_stop_queue(dev);
1434 	}
1435 }
1436 EXPORT_SYMBOL(netif_device_detach);
1437 
1438 /**
1439  * netif_device_attach - mark device as attached
1440  * @dev: network device
1441  *
1442  * Mark device as attached to the system and restart it if needed.
1443  */
1444 void netif_device_attach(struct net_device *dev)
1445 {
1446 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447 	    netif_running(dev)) {
1448 		netif_wake_queue(dev);
1449 		__netdev_watchdog_up(dev);
1450 	}
1451 }
1452 EXPORT_SYMBOL(netif_device_attach);
1453 
1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455 {
1456 	return ((features & NETIF_F_GEN_CSUM) ||
1457 		((features & NETIF_F_IP_CSUM) &&
1458 		 protocol == htons(ETH_P_IP)) ||
1459 		((features & NETIF_F_IPV6_CSUM) &&
1460 		 protocol == htons(ETH_P_IPV6)));
1461 }
1462 
1463 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1464 {
1465 	if (can_checksum_protocol(dev->features, skb->protocol))
1466 		return true;
1467 
1468 	if (skb->protocol == htons(ETH_P_8021Q)) {
1469 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1470 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1471 					  veh->h_vlan_encapsulated_proto))
1472 			return true;
1473 	}
1474 
1475 	return false;
1476 }
1477 
1478 /*
1479  * Invalidate hardware checksum when packet is to be mangled, and
1480  * complete checksum manually on outgoing path.
1481  */
1482 int skb_checksum_help(struct sk_buff *skb)
1483 {
1484 	__wsum csum;
1485 	int ret = 0, offset;
1486 
1487 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1488 		goto out_set_summed;
1489 
1490 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1491 		/* Let GSO fix up the checksum. */
1492 		goto out_set_summed;
1493 	}
1494 
1495 	offset = skb->csum_start - skb_headroom(skb);
1496 	BUG_ON(offset >= skb_headlen(skb));
1497 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1498 
1499 	offset += skb->csum_offset;
1500 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1501 
1502 	if (skb_cloned(skb) &&
1503 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1504 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1505 		if (ret)
1506 			goto out;
1507 	}
1508 
1509 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1510 out_set_summed:
1511 	skb->ip_summed = CHECKSUM_NONE;
1512 out:
1513 	return ret;
1514 }
1515 
1516 /**
1517  *	skb_gso_segment - Perform segmentation on skb.
1518  *	@skb: buffer to segment
1519  *	@features: features for the output path (see dev->features)
1520  *
1521  *	This function segments the given skb and returns a list of segments.
1522  *
1523  *	It may return NULL if the skb requires no segmentation.  This is
1524  *	only possible when GSO is used for verifying header integrity.
1525  */
1526 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1527 {
1528 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1529 	struct packet_type *ptype;
1530 	__be16 type = skb->protocol;
1531 	int err;
1532 
1533 	skb_reset_mac_header(skb);
1534 	skb->mac_len = skb->network_header - skb->mac_header;
1535 	__skb_pull(skb, skb->mac_len);
1536 
1537 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1538 		struct net_device *dev = skb->dev;
1539 		struct ethtool_drvinfo info = {};
1540 
1541 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1542 			dev->ethtool_ops->get_drvinfo(dev, &info);
1543 
1544 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1545 			"ip_summed=%d",
1546 		     info.driver, dev ? dev->features : 0L,
1547 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1548 		     skb->len, skb->data_len, skb->ip_summed);
1549 
1550 		if (skb_header_cloned(skb) &&
1551 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1552 			return ERR_PTR(err);
1553 	}
1554 
1555 	rcu_read_lock();
1556 	list_for_each_entry_rcu(ptype,
1557 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1558 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1559 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1560 				err = ptype->gso_send_check(skb);
1561 				segs = ERR_PTR(err);
1562 				if (err || skb_gso_ok(skb, features))
1563 					break;
1564 				__skb_push(skb, (skb->data -
1565 						 skb_network_header(skb)));
1566 			}
1567 			segs = ptype->gso_segment(skb, features);
1568 			break;
1569 		}
1570 	}
1571 	rcu_read_unlock();
1572 
1573 	__skb_push(skb, skb->data - skb_mac_header(skb));
1574 
1575 	return segs;
1576 }
1577 
1578 EXPORT_SYMBOL(skb_gso_segment);
1579 
1580 /* Take action when hardware reception checksum errors are detected. */
1581 #ifdef CONFIG_BUG
1582 void netdev_rx_csum_fault(struct net_device *dev)
1583 {
1584 	if (net_ratelimit()) {
1585 		printk(KERN_ERR "%s: hw csum failure.\n",
1586 			dev ? dev->name : "<unknown>");
1587 		dump_stack();
1588 	}
1589 }
1590 EXPORT_SYMBOL(netdev_rx_csum_fault);
1591 #endif
1592 
1593 /* Actually, we should eliminate this check as soon as we know that:
1594  * 1. An IOMMU is present and can map all the memory.
1595  * 2. No high memory really exists on this machine.
1596  */
1597 
1598 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1599 {
1600 #ifdef CONFIG_HIGHMEM
1601 	int i;
1602 
1603 	if (dev->features & NETIF_F_HIGHDMA)
1604 		return 0;
1605 
1606 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1607 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1608 			return 1;
1609 
1610 #endif
1611 	return 0;
1612 }
1613 
1614 struct dev_gso_cb {
1615 	void (*destructor)(struct sk_buff *skb);
1616 };
1617 
1618 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1619 
1620 static void dev_gso_skb_destructor(struct sk_buff *skb)
1621 {
1622 	struct dev_gso_cb *cb;
1623 
1624 	do {
1625 		struct sk_buff *nskb = skb->next;
1626 
1627 		skb->next = nskb->next;
1628 		nskb->next = NULL;
1629 		kfree_skb(nskb);
1630 	} while (skb->next);
1631 
1632 	cb = DEV_GSO_CB(skb);
1633 	if (cb->destructor)
1634 		cb->destructor(skb);
1635 }
1636 
1637 /**
1638  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1639  *	@skb: buffer to segment
1640  *
1641  *	This function segments the given skb and stores the list of segments
1642  *	in skb->next.
1643  */
1644 static int dev_gso_segment(struct sk_buff *skb)
1645 {
1646 	struct net_device *dev = skb->dev;
1647 	struct sk_buff *segs;
1648 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1649 					 NETIF_F_SG : 0);
1650 
1651 	segs = skb_gso_segment(skb, features);
1652 
1653 	/* Verifying header integrity only. */
1654 	if (!segs)
1655 		return 0;
1656 
1657 	if (IS_ERR(segs))
1658 		return PTR_ERR(segs);
1659 
1660 	skb->next = segs;
1661 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1662 	skb->destructor = dev_gso_skb_destructor;
1663 
1664 	return 0;
1665 }
1666 
1667 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1668 			struct netdev_queue *txq)
1669 {
1670 	const struct net_device_ops *ops = dev->netdev_ops;
1671 
1672 	prefetch(&dev->netdev_ops->ndo_start_xmit);
1673 	if (likely(!skb->next)) {
1674 		if (!list_empty(&ptype_all))
1675 			dev_queue_xmit_nit(skb, dev);
1676 
1677 		if (netif_needs_gso(dev, skb)) {
1678 			if (unlikely(dev_gso_segment(skb)))
1679 				goto out_kfree_skb;
1680 			if (skb->next)
1681 				goto gso;
1682 		}
1683 
1684 		return ops->ndo_start_xmit(skb, dev);
1685 	}
1686 
1687 gso:
1688 	do {
1689 		struct sk_buff *nskb = skb->next;
1690 		int rc;
1691 
1692 		skb->next = nskb->next;
1693 		nskb->next = NULL;
1694 		rc = ops->ndo_start_xmit(nskb, dev);
1695 		if (unlikely(rc)) {
1696 			nskb->next = skb->next;
1697 			skb->next = nskb;
1698 			return rc;
1699 		}
1700 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1701 			return NETDEV_TX_BUSY;
1702 	} while (skb->next);
1703 
1704 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1705 
1706 out_kfree_skb:
1707 	kfree_skb(skb);
1708 	return 0;
1709 }
1710 
1711 static u32 simple_tx_hashrnd;
1712 static int simple_tx_hashrnd_initialized = 0;
1713 
1714 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1715 {
1716 	u32 addr1, addr2, ports;
1717 	u32 hash, ihl;
1718 	u8 ip_proto = 0;
1719 
1720 	if (unlikely(!simple_tx_hashrnd_initialized)) {
1721 		get_random_bytes(&simple_tx_hashrnd, 4);
1722 		simple_tx_hashrnd_initialized = 1;
1723 	}
1724 
1725 	switch (skb->protocol) {
1726 	case htons(ETH_P_IP):
1727 		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1728 			ip_proto = ip_hdr(skb)->protocol;
1729 		addr1 = ip_hdr(skb)->saddr;
1730 		addr2 = ip_hdr(skb)->daddr;
1731 		ihl = ip_hdr(skb)->ihl;
1732 		break;
1733 	case htons(ETH_P_IPV6):
1734 		ip_proto = ipv6_hdr(skb)->nexthdr;
1735 		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1736 		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1737 		ihl = (40 >> 2);
1738 		break;
1739 	default:
1740 		return 0;
1741 	}
1742 
1743 
1744 	switch (ip_proto) {
1745 	case IPPROTO_TCP:
1746 	case IPPROTO_UDP:
1747 	case IPPROTO_DCCP:
1748 	case IPPROTO_ESP:
1749 	case IPPROTO_AH:
1750 	case IPPROTO_SCTP:
1751 	case IPPROTO_UDPLITE:
1752 		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1753 		break;
1754 
1755 	default:
1756 		ports = 0;
1757 		break;
1758 	}
1759 
1760 	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1761 
1762 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1763 }
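/*
 * Worked example of the scaling in the return statement above: with
 * real_num_tx_queues == 4, a hash of 0x80000000 yields
 * ((u64)0x80000000 * 4) >> 32 == 2, so the 32-bit hash range is spread
 * evenly over queues 0..3 without a modulo operation.
 */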
1764 
1765 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1766 					struct sk_buff *skb)
1767 {
1768 	const struct net_device_ops *ops = dev->netdev_ops;
1769 	u16 queue_index = 0;
1770 
1771 	if (ops->ndo_select_queue)
1772 		queue_index = ops->ndo_select_queue(dev, skb);
1773 	else if (dev->real_num_tx_queues > 1)
1774 		queue_index = simple_tx_hash(dev, skb);
1775 
1776 	skb_set_queue_mapping(skb, queue_index);
1777 	return netdev_get_tx_queue(dev, queue_index);
1778 }
1779 
1780 /**
1781  *	dev_queue_xmit - transmit a buffer
1782  *	@skb: buffer to transmit
1783  *
1784  *	Queue a buffer for transmission to a network device. The caller must
1785  *	have set the device and priority and built the buffer before calling
1786  *	this function. The function can be called from an interrupt.
1787  *
1788  *	A negative errno code is returned on a failure. A success does not
1789  *	guarantee the frame will be transmitted as it may be dropped due
1790  *	to congestion or traffic shaping.
1791  *
1792  * -----------------------------------------------------------------------------------
1793  *      I notice this method can also return errors from the queue disciplines,
1794  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1795  *      be positive.
1796  *
1797  *      Regardless of the return value, the skb is consumed, so it is currently
1798  *      difficult to retry a send to this method.  (You can bump the ref count
1799  *      before sending to hold a reference for retry if you are careful.)
1800  *
1801  *      When calling this method, interrupts MUST be enabled.  This is because
1802  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1803  *          --BLG
1804  */
1805 int dev_queue_xmit(struct sk_buff *skb)
1806 {
1807 	struct net_device *dev = skb->dev;
1808 	struct netdev_queue *txq;
1809 	struct Qdisc *q;
1810 	int rc = -ENOMEM;
1811 
1812 	/* GSO will handle the following emulations directly. */
1813 	if (netif_needs_gso(dev, skb))
1814 		goto gso;
1815 
1816 	if (skb_shinfo(skb)->frag_list &&
1817 	    !(dev->features & NETIF_F_FRAGLIST) &&
1818 	    __skb_linearize(skb))
1819 		goto out_kfree_skb;
1820 
1821 	/* Fragmented skb is linearized if device does not support SG,
1822 	 * or if at least one of the fragments is in highmem and the device
1823 	 * does not support DMA from it.
1824 	 */
1825 	if (skb_shinfo(skb)->nr_frags &&
1826 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1827 	    __skb_linearize(skb))
1828 		goto out_kfree_skb;
1829 
1830 	/* If packet is not checksummed and device does not support
1831 	 * checksumming for this protocol, complete checksumming here.
1832 	 */
1833 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1834 		skb_set_transport_header(skb, skb->csum_start -
1835 					      skb_headroom(skb));
1836 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1837 			goto out_kfree_skb;
1838 	}
1839 
1840 gso:
1841 	/* Disable soft irqs for various locks below. Also
1842 	 * stops preemption for RCU.
1843 	 */
1844 	rcu_read_lock_bh();
1845 
1846 	txq = dev_pick_tx(dev, skb);
1847 	q = rcu_dereference(txq->qdisc);
1848 
1849 #ifdef CONFIG_NET_CLS_ACT
1850 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1851 #endif
1852 	if (q->enqueue) {
1853 		spinlock_t *root_lock = qdisc_lock(q);
1854 
1855 		spin_lock(root_lock);
1856 
1857 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1858 			kfree_skb(skb);
1859 			rc = NET_XMIT_DROP;
1860 		} else {
1861 			rc = qdisc_enqueue_root(skb, q);
1862 			qdisc_run(q);
1863 		}
1864 		spin_unlock(root_lock);
1865 
1866 		goto out;
1867 	}
1868 
1869 	/* The device has no queue. Common case for software devices:
1870 	   loopback, all sorts of tunnels...
1871 
1872 	   Really, it is unlikely that netif_tx_lock protection is necessary
1873 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
1874 	   counters.)
1875 	   However, it is possible that some of them rely on the protection
1876 	   we take here.
1877 
1878 	   Check this and remove the lock; it is not prone to deadlocks.
1879 	   Or remove the noqueue qdisc instead, which is even simpler 8)
1880 	 */
1881 	if (dev->flags & IFF_UP) {
1882 		int cpu = smp_processor_id(); /* ok because BHs are off */
1883 
1884 		if (txq->xmit_lock_owner != cpu) {
1885 
1886 			HARD_TX_LOCK(dev, txq, cpu);
1887 
1888 			if (!netif_tx_queue_stopped(txq)) {
1889 				rc = 0;
1890 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1891 					HARD_TX_UNLOCK(dev, txq);
1892 					goto out;
1893 				}
1894 			}
1895 			HARD_TX_UNLOCK(dev, txq);
1896 			if (net_ratelimit())
1897 				printk(KERN_CRIT "Virtual device %s asks to "
1898 				       "queue packet!\n", dev->name);
1899 		} else {
1900 			/* Recursion detected! It can happen,
1901 			 * unfortunately. */
1902 			if (net_ratelimit())
1903 				printk(KERN_CRIT "Dead loop on virtual device "
1904 				       "%s, fix it urgently!\n", dev->name);
1905 		}
1906 	}
1907 
1908 	rc = -ENETDOWN;
1909 	rcu_read_unlock_bh();
1910 
1911 out_kfree_skb:
1912 	kfree_skb(skb);
1913 	return rc;
1914 out:
1915 	rcu_read_unlock_bh();
1916 	return rc;
1917 }
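
/*
 * A minimal usage sketch (not taken from an in-tree driver; the function and
 * variable names below are hypothetical): a virtual device picks the output
 * device and hands the skb to dev_queue_xmit().  The skb is consumed whether
 * or not the call succeeds.
 */
#if 0
static int example_forward(struct sk_buff *skb, struct net_device *out_dev)
{
	/* the caller must set the output device before queueing */
	skb->dev = out_dev;

	/*
	 * Returns 0 or a positive NET_XMIT_* code from the qdisc, or a
	 * negative errno such as -ENETDOWN; in every case the skb is no
	 * longer ours.
	 */
	return dev_queue_xmit(skb);
}
#endif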
1918 
1919 
1920 /*=======================================================================
1921 			Receiver routines
1922   =======================================================================*/
1923 
1924 int netdev_max_backlog __read_mostly = 1000;
1925 int netdev_budget __read_mostly = 300;
1926 int weight_p __read_mostly = 64;            /* old backlog weight */
1927 
1928 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1929 
1930 
1931 /**
1932  *	netif_rx	-	post buffer to the network code
1933  *	@skb: buffer to post
1934  *
1935  *	This function receives a packet from a device driver and queues it for
1936  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1937  *	may be dropped during processing for congestion control or by the
1938  *	protocol layers.
1939  *
1940  *	return values:
1941  *	NET_RX_SUCCESS	(no congestion)
1942  *	NET_RX_DROP     (packet was dropped)
1943  *
1944  */
1945 
1946 int netif_rx(struct sk_buff *skb)
1947 {
1948 	struct softnet_data *queue;
1949 	unsigned long flags;
1950 
1951 	/* if netpoll wants it, pretend we never saw it */
1952 	if (netpoll_rx(skb))
1953 		return NET_RX_DROP;
1954 
1955 	if (!skb->tstamp.tv64)
1956 		net_timestamp(skb);
1957 
1958 	/*
1959 	 * The code is arranged so that the path is shortest when
1960 	 * the CPU is congested but still operating.
1961 	 */
1962 	local_irq_save(flags);
1963 	queue = &__get_cpu_var(softnet_data);
1964 
1965 	__get_cpu_var(netdev_rx_stat).total++;
1966 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1967 		if (queue->input_pkt_queue.qlen) {
1968 enqueue:
1969 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1970 			local_irq_restore(flags);
1971 			return NET_RX_SUCCESS;
1972 		}
1973 
1974 		napi_schedule(&queue->backlog);
1975 		goto enqueue;
1976 	}
1977 
1978 	__get_cpu_var(netdev_rx_stat).dropped++;
1979 	local_irq_restore(flags);
1980 
1981 	kfree_skb(skb);
1982 	return NET_RX_DROP;
1983 }
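
/*
 * A hedged sketch of the classic non-NAPI receive path (names such as
 * example_isr_rx are hypothetical): the driver copies the frame into a fresh
 * skb from interrupt context and hands it to netif_rx(), which only queues it
 * on the per-CPU backlog.
 */
#if 0
static void example_isr_rx(struct net_device *dev, const void *data,
			   unsigned int len)
{
	struct sk_buff *skb = dev_alloc_skb(len + NET_IP_ALIGN);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	skb_reserve(skb, NET_IP_ALIGN);		/* align the IP header */
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);				/* safe from hard irq context */
}
#endif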
1984 
1985 int netif_rx_ni(struct sk_buff *skb)
1986 {
1987 	int err;
1988 
1989 	preempt_disable();
1990 	err = netif_rx(skb);
1991 	if (local_softirq_pending())
1992 		do_softirq();
1993 	preempt_enable();
1994 
1995 	return err;
1996 }
1997 
1998 EXPORT_SYMBOL(netif_rx_ni);
1999 
2000 static void net_tx_action(struct softirq_action *h)
2001 {
2002 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2003 
2004 	if (sd->completion_queue) {
2005 		struct sk_buff *clist;
2006 
2007 		local_irq_disable();
2008 		clist = sd->completion_queue;
2009 		sd->completion_queue = NULL;
2010 		local_irq_enable();
2011 
2012 		while (clist) {
2013 			struct sk_buff *skb = clist;
2014 			clist = clist->next;
2015 
2016 			WARN_ON(atomic_read(&skb->users));
2017 			__kfree_skb(skb);
2018 		}
2019 	}
2020 
2021 	if (sd->output_queue) {
2022 		struct Qdisc *head;
2023 
2024 		local_irq_disable();
2025 		head = sd->output_queue;
2026 		sd->output_queue = NULL;
2027 		local_irq_enable();
2028 
2029 		while (head) {
2030 			struct Qdisc *q = head;
2031 			spinlock_t *root_lock;
2032 
2033 			head = head->next_sched;
2034 
2035 			root_lock = qdisc_lock(q);
2036 			if (spin_trylock(root_lock)) {
2037 				smp_mb__before_clear_bit();
2038 				clear_bit(__QDISC_STATE_SCHED,
2039 					  &q->state);
2040 				qdisc_run(q);
2041 				spin_unlock(root_lock);
2042 			} else {
2043 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2044 					      &q->state)) {
2045 					__netif_reschedule(q);
2046 				} else {
2047 					smp_mb__before_clear_bit();
2048 					clear_bit(__QDISC_STATE_SCHED,
2049 						  &q->state);
2050 				}
2051 			}
2052 		}
2053 	}
2054 }
2055 
2056 static inline int deliver_skb(struct sk_buff *skb,
2057 			      struct packet_type *pt_prev,
2058 			      struct net_device *orig_dev)
2059 {
2060 	atomic_inc(&skb->users);
2061 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2062 }
2063 
2064 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2065 /* These hooks are defined here for ATM */
2066 struct net_bridge;
2067 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2068 						unsigned char *addr);
2069 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2070 
2071 /*
2072  * If the bridge module is loaded, call the bridging hook.
2073  * Returns NULL if the packet was consumed.
2074  */
2075 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2076 					struct sk_buff *skb) __read_mostly;
2077 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2078 					    struct packet_type **pt_prev, int *ret,
2079 					    struct net_device *orig_dev)
2080 {
2081 	struct net_bridge_port *port;
2082 
2083 	if (skb->pkt_type == PACKET_LOOPBACK ||
2084 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2085 		return skb;
2086 
2087 	if (*pt_prev) {
2088 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2089 		*pt_prev = NULL;
2090 	}
2091 
2092 	return br_handle_frame_hook(port, skb);
2093 }
2094 #else
2095 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2096 #endif
2097 
2098 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2099 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2100 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2101 
2102 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2103 					     struct packet_type **pt_prev,
2104 					     int *ret,
2105 					     struct net_device *orig_dev)
2106 {
2107 	if (skb->dev->macvlan_port == NULL)
2108 		return skb;
2109 
2110 	if (*pt_prev) {
2111 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2112 		*pt_prev = NULL;
2113 	}
2114 	return macvlan_handle_frame_hook(skb);
2115 }
2116 #else
2117 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2118 #endif
2119 
2120 #ifdef CONFIG_NET_CLS_ACT
2121 /* TODO: Maybe we should just force sch_ingress to be compiled in
2122  * when CONFIG_NET_CLS_ACT is?  Otherwise we execute some useless
2123  * instructions (a compare and two extra stores) right now if it is not
2124  * enabled but CONFIG_NET_CLS_ACT is.
2125  * NOTE: This doesn't remove any functionality; if you don't have
2126  * the ingress scheduler, you just can't add policies on ingress.
2127  *
2128  */
2129 static int ing_filter(struct sk_buff *skb)
2130 {
2131 	struct net_device *dev = skb->dev;
2132 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2133 	struct netdev_queue *rxq;
2134 	int result = TC_ACT_OK;
2135 	struct Qdisc *q;
2136 
2137 	if (MAX_RED_LOOP < ttl++) {
2138 		printk(KERN_WARNING
2139 		       "Redir loop detected, dropping packet (%d->%d)\n",
2140 		       skb->iif, dev->ifindex);
2141 		return TC_ACT_SHOT;
2142 	}
2143 
2144 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2145 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2146 
2147 	rxq = &dev->rx_queue;
2148 
2149 	q = rxq->qdisc;
2150 	if (q != &noop_qdisc) {
2151 		spin_lock(qdisc_lock(q));
2152 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2153 			result = qdisc_enqueue_root(skb, q);
2154 		spin_unlock(qdisc_lock(q));
2155 	}
2156 
2157 	return result;
2158 }
2159 
2160 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2161 					 struct packet_type **pt_prev,
2162 					 int *ret, struct net_device *orig_dev)
2163 {
2164 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2165 		goto out;
2166 
2167 	if (*pt_prev) {
2168 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2169 		*pt_prev = NULL;
2170 	} else {
2171 		/* Huh? Why does turning on AF_PACKET affect this? */
2172 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2173 	}
2174 
2175 	switch (ing_filter(skb)) {
2176 	case TC_ACT_SHOT:
2177 	case TC_ACT_STOLEN:
2178 		kfree_skb(skb);
2179 		return NULL;
2180 	}
2181 
2182 out:
2183 	skb->tc_verd = 0;
2184 	return skb;
2185 }
2186 #endif
2187 
2188 /*
2189  * 	netif_nit_deliver - deliver received packets to network taps
2190  * 	@skb: buffer
2191  *
2192  * 	This function is used to deliver incoming packets to network
2193  * 	taps. It should be used when the normal netif_receive_skb path
2194  * 	is bypassed, for example because of VLAN acceleration.
2195  */
2196 void netif_nit_deliver(struct sk_buff *skb)
2197 {
2198 	struct packet_type *ptype;
2199 
2200 	if (list_empty(&ptype_all))
2201 		return;
2202 
2203 	skb_reset_network_header(skb);
2204 	skb_reset_transport_header(skb);
2205 	skb->mac_len = skb->network_header - skb->mac_header;
2206 
2207 	rcu_read_lock();
2208 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2209 		if (!ptype->dev || ptype->dev == skb->dev)
2210 			deliver_skb(skb, ptype, skb->dev);
2211 	}
2212 	rcu_read_unlock();
2213 }
2214 
2215 /**
2216  *	netif_receive_skb - process receive buffer from network
2217  *	@skb: buffer to process
2218  *
2219  *	netif_receive_skb() is the main receive data processing function.
2220  *	It always succeeds. The buffer may be dropped during processing
2221  *	for congestion control or by the protocol layers.
2222  *
2223  *	This function may only be called from softirq context and interrupts
2224  *	should be enabled.
2225  *
2226  *	Return values (usually ignored):
2227  *	NET_RX_SUCCESS: no congestion
2228  *	NET_RX_DROP: packet was dropped
2229  */
2230 int netif_receive_skb(struct sk_buff *skb)
2231 {
2232 	struct packet_type *ptype, *pt_prev;
2233 	struct net_device *orig_dev;
2234 	struct net_device *null_or_orig;
2235 	int ret = NET_RX_DROP;
2236 	__be16 type;
2237 
2238 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2239 		return NET_RX_SUCCESS;
2240 
2241 	/* if we've gotten here through NAPI, check netpoll */
2242 	if (netpoll_receive_skb(skb))
2243 		return NET_RX_DROP;
2244 
2245 	if (!skb->tstamp.tv64)
2246 		net_timestamp(skb);
2247 
2248 	if (!skb->iif)
2249 		skb->iif = skb->dev->ifindex;
2250 
2251 	null_or_orig = NULL;
2252 	orig_dev = skb->dev;
2253 	if (orig_dev->master) {
2254 		if (skb_bond_should_drop(skb))
2255 			null_or_orig = orig_dev; /* deliver only exact match */
2256 		else
2257 			skb->dev = orig_dev->master;
2258 	}
2259 
2260 	__get_cpu_var(netdev_rx_stat).total++;
2261 
2262 	skb_reset_network_header(skb);
2263 	skb_reset_transport_header(skb);
2264 	skb->mac_len = skb->network_header - skb->mac_header;
2265 
2266 	pt_prev = NULL;
2267 
2268 	rcu_read_lock();
2269 
2270 #ifdef CONFIG_NET_CLS_ACT
2271 	if (skb->tc_verd & TC_NCLS) {
2272 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2273 		goto ncls;
2274 	}
2275 #endif
2276 
2277 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2278 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2279 		    ptype->dev == orig_dev) {
2280 			if (pt_prev)
2281 				ret = deliver_skb(skb, pt_prev, orig_dev);
2282 			pt_prev = ptype;
2283 		}
2284 	}
2285 
2286 #ifdef CONFIG_NET_CLS_ACT
2287 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2288 	if (!skb)
2289 		goto out;
2290 ncls:
2291 #endif
2292 
2293 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2294 	if (!skb)
2295 		goto out;
2296 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2297 	if (!skb)
2298 		goto out;
2299 
2300 	type = skb->protocol;
2301 	list_for_each_entry_rcu(ptype,
2302 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2303 		if (ptype->type == type &&
2304 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2305 		     ptype->dev == orig_dev)) {
2306 			if (pt_prev)
2307 				ret = deliver_skb(skb, pt_prev, orig_dev);
2308 			pt_prev = ptype;
2309 		}
2310 	}
2311 
2312 	if (pt_prev) {
2313 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2314 	} else {
2315 		kfree_skb(skb);
2316 		/* Jamal, now you will not be able to escape explaining
2317 		 * to me how you were going to use this. :-)
2318 		 */
2319 		ret = NET_RX_DROP;
2320 	}
2321 
2322 out:
2323 	rcu_read_unlock();
2324 	return ret;
2325 }
2326 
2327 /* Network device is going away, flush any packets still pending  */
2328 static void flush_backlog(void *arg)
2329 {
2330 	struct net_device *dev = arg;
2331 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2332 	struct sk_buff *skb, *tmp;
2333 
2334 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2335 		if (skb->dev == dev) {
2336 			__skb_unlink(skb, &queue->input_pkt_queue);
2337 			kfree_skb(skb);
2338 		}
2339 }
2340 
2341 static int napi_gro_complete(struct sk_buff *skb)
2342 {
2343 	struct packet_type *ptype;
2344 	__be16 type = skb->protocol;
2345 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2346 	int err = -ENOENT;
2347 
2348 	if (NAPI_GRO_CB(skb)->count == 1)
2349 		goto out;
2350 
2351 	rcu_read_lock();
2352 	list_for_each_entry_rcu(ptype, head, list) {
2353 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2354 			continue;
2355 
2356 		err = ptype->gro_complete(skb);
2357 		break;
2358 	}
2359 	rcu_read_unlock();
2360 
2361 	if (err) {
2362 		WARN_ON(&ptype->list == head);
2363 		kfree_skb(skb);
2364 		return NET_RX_SUCCESS;
2365 	}
2366 
2367 out:
2368 	skb_shinfo(skb)->gso_size = 0;
2369 	__skb_push(skb, -skb_network_offset(skb));
2370 	return netif_receive_skb(skb);
2371 }
2372 
2373 void napi_gro_flush(struct napi_struct *napi)
2374 {
2375 	struct sk_buff *skb, *next;
2376 
2377 	for (skb = napi->gro_list; skb; skb = next) {
2378 		next = skb->next;
2379 		skb->next = NULL;
2380 		napi_gro_complete(skb);
2381 	}
2382 
2383 	napi->gro_list = NULL;
2384 }
2385 EXPORT_SYMBOL(napi_gro_flush);
2386 
2387 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2388 {
2389 	struct sk_buff **pp = NULL;
2390 	struct packet_type *ptype;
2391 	__be16 type = skb->protocol;
2392 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2393 	int count = 0;
2394 	int same_flow;
2395 	int mac_len;
2396 	int free;
2397 
2398 	if (!(skb->dev->features & NETIF_F_GRO))
2399 		goto normal;
2400 
2401 	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2402 		goto normal;
2403 
2404 	rcu_read_lock();
2405 	list_for_each_entry_rcu(ptype, head, list) {
2406 		struct sk_buff *p;
2407 
2408 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2409 			continue;
2410 
2411 		skb_reset_network_header(skb);
2412 		mac_len = skb->network_header - skb->mac_header;
2413 		skb->mac_len = mac_len;
2414 		NAPI_GRO_CB(skb)->same_flow = 0;
2415 		NAPI_GRO_CB(skb)->flush = 0;
2416 		NAPI_GRO_CB(skb)->free = 0;
2417 
2418 		for (p = napi->gro_list; p; p = p->next) {
2419 			count++;
2420 
2421 			if (!NAPI_GRO_CB(p)->same_flow)
2422 				continue;
2423 
2424 			if (p->mac_len != mac_len ||
2425 			    memcmp(skb_mac_header(p), skb_mac_header(skb),
2426 				   mac_len))
2427 				NAPI_GRO_CB(p)->same_flow = 0;
2428 		}
2429 
2430 		pp = ptype->gro_receive(&napi->gro_list, skb);
2431 		break;
2432 	}
2433 	rcu_read_unlock();
2434 
2435 	if (&ptype->list == head)
2436 		goto normal;
2437 
2438 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2439 	free = NAPI_GRO_CB(skb)->free;
2440 
2441 	if (pp) {
2442 		struct sk_buff *nskb = *pp;
2443 
2444 		*pp = nskb->next;
2445 		nskb->next = NULL;
2446 		napi_gro_complete(nskb);
2447 		count--;
2448 	}
2449 
2450 	if (same_flow)
2451 		goto ok;
2452 
2453 	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
2454 		__skb_push(skb, -skb_network_offset(skb));
2455 		goto normal;
2456 	}
2457 
2458 	NAPI_GRO_CB(skb)->count = 1;
2459 	skb_shinfo(skb)->gso_size = skb->len;
2460 	skb->next = napi->gro_list;
2461 	napi->gro_list = skb;
2462 
2463 ok:
2464 	return free;
2465 
2466 normal:
2467 	return -1;
2468 }
2469 EXPORT_SYMBOL(dev_gro_receive);
2470 
2471 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2472 {
2473 	struct sk_buff *p;
2474 
2475 	for (p = napi->gro_list; p; p = p->next) {
2476 		NAPI_GRO_CB(p)->same_flow = 1;
2477 		NAPI_GRO_CB(p)->flush = 0;
2478 	}
2479 
2480 	return dev_gro_receive(napi, skb);
2481 }
2482 
2483 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2484 {
2485 	if (netpoll_receive_skb(skb))
2486 		return NET_RX_DROP;
2487 
2488 	switch (__napi_gro_receive(napi, skb)) {
2489 	case -1:
2490 		return netif_receive_skb(skb);
2491 
2492 	case 1:
2493 		kfree_skb(skb);
2494 		break;
2495 	}
2496 
2497 	return NET_RX_SUCCESS;
2498 }
2499 EXPORT_SYMBOL(napi_gro_receive);
2500 
2501 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2502 {
2503 	__skb_pull(skb, skb_headlen(skb));
2504 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2505 
2506 	napi->skb = skb;
2507 }
2508 EXPORT_SYMBOL(napi_reuse_skb);
2509 
2510 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2511 				  struct napi_gro_fraginfo *info)
2512 {
2513 	struct net_device *dev = napi->dev;
2514 	struct sk_buff *skb = napi->skb;
2515 
2516 	napi->skb = NULL;
2517 
2518 	if (!skb) {
2519 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2520 		if (!skb)
2521 			goto out;
2522 
2523 		skb_reserve(skb, NET_IP_ALIGN);
2524 	}
2525 
2526 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2527 	skb_shinfo(skb)->nr_frags = info->nr_frags;
2528 	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2529 
2530 	skb->data_len = info->len;
2531 	skb->len += info->len;
2532 	skb->truesize += info->len;
2533 
2534 	if (!pskb_may_pull(skb, ETH_HLEN)) {
2535 		napi_reuse_skb(napi, skb);
2536 		skb = NULL;
2537 		goto out;
2538 	}
2539 
2540 	skb->protocol = eth_type_trans(skb, dev);
2541 
2542 	skb->ip_summed = info->ip_summed;
2543 	skb->csum = info->csum;
2544 
2545 out:
2546 	return skb;
2547 }
2548 EXPORT_SYMBOL(napi_fraginfo_skb);
2549 
2550 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2551 {
2552 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2553 	int err = NET_RX_DROP;
2554 
2555 	if (!skb)
2556 		goto out;
2557 
2558 	if (netpoll_receive_skb(skb))
2559 		goto out;
2560 
2561 	err = NET_RX_SUCCESS;
2562 
2563 	switch (__napi_gro_receive(napi, skb)) {
2564 	case -1:
2565 		return netif_receive_skb(skb);
2566 
2567 	case 0:
2568 		goto out;
2569 	}
2570 
2571 	napi_reuse_skb(napi, skb);
2572 
2573 out:
2574 	return err;
2575 }
2576 EXPORT_SYMBOL(napi_gro_frags);
2577 
2578 static int process_backlog(struct napi_struct *napi, int quota)
2579 {
2580 	int work = 0;
2581 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2582 	unsigned long start_time = jiffies;
2583 
2584 	napi->weight = weight_p;
2585 	do {
2586 		struct sk_buff *skb;
2587 
2588 		local_irq_disable();
2589 		skb = __skb_dequeue(&queue->input_pkt_queue);
2590 		if (!skb) {
2591 			local_irq_enable();
2592 			napi_complete(napi);
2593 			goto out;
2594 		}
2595 		local_irq_enable();
2596 
2597 		napi_gro_receive(napi, skb);
2598 	} while (++work < quota && jiffies == start_time);
2599 
2600 	napi_gro_flush(napi);
2601 
2602 out:
2603 	return work;
2604 }
2605 
2606 /**
2607  * __napi_schedule - schedule for receive
2608  * @n: entry to schedule
2609  *
2610  * The entry's receive function will be scheduled to run
2611  */
2612 void __napi_schedule(struct napi_struct *n)
2613 {
2614 	unsigned long flags;
2615 
2616 	local_irq_save(flags);
2617 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2618 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2619 	local_irq_restore(flags);
2620 }
2621 EXPORT_SYMBOL(__napi_schedule);
2622 
2623 void __napi_complete(struct napi_struct *n)
2624 {
2625 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2626 	BUG_ON(n->gro_list);
2627 
2628 	list_del(&n->poll_list);
2629 	smp_mb__before_clear_bit();
2630 	clear_bit(NAPI_STATE_SCHED, &n->state);
2631 }
2632 EXPORT_SYMBOL(__napi_complete);
2633 
2634 void napi_complete(struct napi_struct *n)
2635 {
2636 	unsigned long flags;
2637 
2638 	/*
2639 	 * don't let napi dequeue from the cpu poll list
2640 	 * just in case it's running on a different cpu
2641 	 */
2642 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2643 		return;
2644 
2645 	napi_gro_flush(n);
2646 	local_irq_save(flags);
2647 	__napi_complete(n);
2648 	local_irq_restore(flags);
2649 }
2650 EXPORT_SYMBOL(napi_complete);
2651 
2652 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2653 		    int (*poll)(struct napi_struct *, int), int weight)
2654 {
2655 	INIT_LIST_HEAD(&napi->poll_list);
2656 	napi->gro_list = NULL;
2657 	napi->skb = NULL;
2658 	napi->poll = poll;
2659 	napi->weight = weight;
2660 	list_add(&napi->dev_list, &dev->napi_list);
2661 	napi->dev = dev;
2662 #ifdef CONFIG_NETPOLL
2663 	spin_lock_init(&napi->poll_lock);
2664 	napi->poll_owner = -1;
2665 #endif
2666 	set_bit(NAPI_STATE_SCHED, &napi->state);
2667 }
2668 EXPORT_SYMBOL(netif_napi_add);
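
/*
 * A sketch of how a driver typically wires this up (all example_* names are
 * hypothetical): register the context with netif_napi_add() at probe time,
 * call napi_schedule() from the interrupt handler, and feed packets to
 * napi_gro_receive() / napi_complete() from the poll callback.
 */
#if 0
struct example_priv {
	struct net_device *dev;
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_rx_dequeue(priv);	/* hypothetical */

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->dev);
		napi_gro_receive(napi, skb);
		work++;
	}

	if (work < budget) {
		napi_complete(napi);
		example_enable_rx_irq(priv);	/* hypothetical: re-arm the device */
	}
	return work;
}

static void example_setup(struct example_priv *priv)
{
	/* 64 is the conventional weight for an Ethernet-class device */
	netif_napi_add(priv->dev, &priv->napi, example_poll, 64);
}
#endif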
2669 
2670 void netif_napi_del(struct napi_struct *napi)
2671 {
2672 	struct sk_buff *skb, *next;
2673 
2674 	list_del_init(&napi->dev_list);
2675 	kfree_skb(napi->skb);
2676 
2677 	for (skb = napi->gro_list; skb; skb = next) {
2678 		next = skb->next;
2679 		skb->next = NULL;
2680 		kfree_skb(skb);
2681 	}
2682 
2683 	napi->gro_list = NULL;
2684 }
2685 EXPORT_SYMBOL(netif_napi_del);
2686 
2687 
2688 static void net_rx_action(struct softirq_action *h)
2689 {
2690 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2691 	unsigned long time_limit = jiffies + 2;
2692 	int budget = netdev_budget;
2693 	void *have;
2694 
2695 	local_irq_disable();
2696 
2697 	while (!list_empty(list)) {
2698 		struct napi_struct *n;
2699 		int work, weight;
2700 
2701 		/* If the softirq window is exhausted then punt.
2702 		 * Allow this to run for 2 jiffies, which allows
2703 		 * an average latency of 1.5/HZ.
2704 		 */
2705 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2706 			goto softnet_break;
2707 
2708 		local_irq_enable();
2709 
2710 		/* Even though interrupts have been re-enabled, this
2711 		 * access is safe because interrupts can only add new
2712 		 * entries to the tail of this list, and only ->poll()
2713 		 * calls can remove this head entry from the list.
2714 		 */
2715 		n = list_entry(list->next, struct napi_struct, poll_list);
2716 
2717 		have = netpoll_poll_lock(n);
2718 
2719 		weight = n->weight;
2720 
2721 		/* This NAPI_STATE_SCHED test is for avoiding a race
2722 		 * with netpoll's poll_napi().  Only the entity which
2723 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2724 		 * actually make the ->poll() call.  Therefore we avoid
2725 		 * accidentally calling ->poll() when NAPI is not scheduled.
2726 		 */
2727 		work = 0;
2728 		if (test_bit(NAPI_STATE_SCHED, &n->state))
2729 			work = n->poll(n, weight);
2730 
2731 		WARN_ON_ONCE(work > weight);
2732 
2733 		budget -= work;
2734 
2735 		local_irq_disable();
2736 
2737 		/* Drivers must not modify the NAPI state if they
2738 		 * consume the entire weight.  In such cases this code
2739 		 * still "owns" the NAPI instance and therefore can
2740 		 * move the instance around on the list at-will.
2741 		 */
2742 		if (unlikely(work == weight)) {
2743 			if (unlikely(napi_disable_pending(n)))
2744 				__napi_complete(n);
2745 			else
2746 				list_move_tail(&n->poll_list, list);
2747 		}
2748 
2749 		netpoll_poll_unlock(have);
2750 	}
2751 out:
2752 	local_irq_enable();
2753 
2754 #ifdef CONFIG_NET_DMA
2755 	/*
2756 	 * There may not be any more sk_buffs coming right now, so push
2757 	 * any pending DMA copies to hardware
2758 	 */
2759 	dma_issue_pending_all();
2760 #endif
2761 
2762 	return;
2763 
2764 softnet_break:
2765 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2766 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2767 	goto out;
2768 }
2769 
2770 static gifconf_func_t * gifconf_list [NPROTO];
2771 
2772 /**
2773  *	register_gifconf	-	register a SIOCGIF handler
2774  *	@family: Address family
2775  *	@gifconf: Function handler
2776  *
2777  *	Register protocol dependent address dumping routines. The handler
2778  *	that is passed must not be freed or reused until it has been replaced
2779  *	by another handler.
2780  */
2781 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2782 {
2783 	if (family >= NPROTO)
2784 		return -EINVAL;
2785 	gifconf_list[family] = gifconf;
2786 	return 0;
2787 }
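
/*
 * Usage sketch: an address family registers one handler at boot, as the IPv4
 * init code does for SIOCGIFCONF.  The handler follows gifconf_func_t, and a
 * NULL buffer pointer asks it for the space it would need rather than for
 * output (see dev_ifconf() below).
 */
#if 0
	register_gifconf(PF_INET, inet_gifconf);
#endif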
2788 
2789 
2790 /*
2791  *	Map an interface index to its name (SIOCGIFNAME)
2792  */
2793 
2794 /*
2795  *	We need this ioctl for efficient implementation of the
2796  *	if_indextoname() function required by the IPv6 API.  Without
2797  *	it, we would have to search all the interfaces to find a
2798  *	match.  --pb
2799  */
2800 
2801 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2802 {
2803 	struct net_device *dev;
2804 	struct ifreq ifr;
2805 
2806 	/*
2807 	 *	Fetch the caller's info block.
2808 	 */
2809 
2810 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2811 		return -EFAULT;
2812 
2813 	read_lock(&dev_base_lock);
2814 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2815 	if (!dev) {
2816 		read_unlock(&dev_base_lock);
2817 		return -ENODEV;
2818 	}
2819 
2820 	strcpy(ifr.ifr_name, dev->name);
2821 	read_unlock(&dev_base_lock);
2822 
2823 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2824 		return -EFAULT;
2825 	return 0;
2826 }
2827 
2828 /*
2829  *	Perform a SIOCGIFCONF call. This structure will change
2830  *	size eventually, and there is nothing I can do about it.
2831  *	Thus we will need a 'compatibility mode'.
2832  */
2833 
2834 static int dev_ifconf(struct net *net, char __user *arg)
2835 {
2836 	struct ifconf ifc;
2837 	struct net_device *dev;
2838 	char __user *pos;
2839 	int len;
2840 	int total;
2841 	int i;
2842 
2843 	/*
2844 	 *	Fetch the caller's info block.
2845 	 */
2846 
2847 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2848 		return -EFAULT;
2849 
2850 	pos = ifc.ifc_buf;
2851 	len = ifc.ifc_len;
2852 
2853 	/*
2854 	 *	Loop over the interfaces, and write an info block for each.
2855 	 */
2856 
2857 	total = 0;
2858 	for_each_netdev(net, dev) {
2859 		for (i = 0; i < NPROTO; i++) {
2860 			if (gifconf_list[i]) {
2861 				int done;
2862 				if (!pos)
2863 					done = gifconf_list[i](dev, NULL, 0);
2864 				else
2865 					done = gifconf_list[i](dev, pos + total,
2866 							       len - total);
2867 				if (done < 0)
2868 					return -EFAULT;
2869 				total += done;
2870 			}
2871 		}
2872 	}
2873 
2874 	/*
2875 	 *	All done.  Write the updated control block back to the caller.
2876 	 */
2877 	ifc.ifc_len = total;
2878 
2879 	/*
2880 	 * 	Both BSD and Solaris return 0 here, so we do too.
2881 	 */
2882 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2883 }
2884 
2885 #ifdef CONFIG_PROC_FS
2886 /*
2887  *	This is invoked by the /proc filesystem handler to display a device
2888  *	in detail.
2889  */
2890 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2891 	__acquires(dev_base_lock)
2892 {
2893 	struct net *net = seq_file_net(seq);
2894 	loff_t off;
2895 	struct net_device *dev;
2896 
2897 	read_lock(&dev_base_lock);
2898 	if (!*pos)
2899 		return SEQ_START_TOKEN;
2900 
2901 	off = 1;
2902 	for_each_netdev(net, dev)
2903 		if (off++ == *pos)
2904 			return dev;
2905 
2906 	return NULL;
2907 }
2908 
2909 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2910 {
2911 	struct net *net = seq_file_net(seq);
2912 	++*pos;
2913 	return v == SEQ_START_TOKEN ?
2914 		first_net_device(net) : next_net_device((struct net_device *)v);
2915 }
2916 
2917 void dev_seq_stop(struct seq_file *seq, void *v)
2918 	__releases(dev_base_lock)
2919 {
2920 	read_unlock(&dev_base_lock);
2921 }
2922 
2923 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2924 {
2925 	const struct net_device_stats *stats = dev_get_stats(dev);
2926 
2927 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2928 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2929 		   dev->name, stats->rx_bytes, stats->rx_packets,
2930 		   stats->rx_errors,
2931 		   stats->rx_dropped + stats->rx_missed_errors,
2932 		   stats->rx_fifo_errors,
2933 		   stats->rx_length_errors + stats->rx_over_errors +
2934 		    stats->rx_crc_errors + stats->rx_frame_errors,
2935 		   stats->rx_compressed, stats->multicast,
2936 		   stats->tx_bytes, stats->tx_packets,
2937 		   stats->tx_errors, stats->tx_dropped,
2938 		   stats->tx_fifo_errors, stats->collisions,
2939 		   stats->tx_carrier_errors +
2940 		    stats->tx_aborted_errors +
2941 		    stats->tx_window_errors +
2942 		    stats->tx_heartbeat_errors,
2943 		   stats->tx_compressed);
2944 }
2945 
2946 /*
2947  *	Called from the PROCfs module. This now uses the new arbitrary sized
2948  *	/proc/net interface to create /proc/net/dev
2949  */
2950 static int dev_seq_show(struct seq_file *seq, void *v)
2951 {
2952 	if (v == SEQ_START_TOKEN)
2953 		seq_puts(seq, "Inter-|   Receive                            "
2954 			      "                    |  Transmit\n"
2955 			      " face |bytes    packets errs drop fifo frame "
2956 			      "compressed multicast|bytes    packets errs "
2957 			      "drop fifo colls carrier compressed\n");
2958 	else
2959 		dev_seq_printf_stats(seq, v);
2960 	return 0;
2961 }
2962 
2963 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2964 {
2965 	struct netif_rx_stats *rc = NULL;
2966 
2967 	while (*pos < nr_cpu_ids)
2968 		if (cpu_online(*pos)) {
2969 			rc = &per_cpu(netdev_rx_stat, *pos);
2970 			break;
2971 		} else
2972 			++*pos;
2973 	return rc;
2974 }
2975 
2976 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2977 {
2978 	return softnet_get_online(pos);
2979 }
2980 
2981 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2982 {
2983 	++*pos;
2984 	return softnet_get_online(pos);
2985 }
2986 
2987 static void softnet_seq_stop(struct seq_file *seq, void *v)
2988 {
2989 }
2990 
2991 static int softnet_seq_show(struct seq_file *seq, void *v)
2992 {
2993 	struct netif_rx_stats *s = v;
2994 
2995 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2996 		   s->total, s->dropped, s->time_squeeze, 0,
2997 		   0, 0, 0, 0, /* was fastroute */
2998 		   s->cpu_collision );
2999 	return 0;
3000 }
3001 
3002 static const struct seq_operations dev_seq_ops = {
3003 	.start = dev_seq_start,
3004 	.next  = dev_seq_next,
3005 	.stop  = dev_seq_stop,
3006 	.show  = dev_seq_show,
3007 };
3008 
3009 static int dev_seq_open(struct inode *inode, struct file *file)
3010 {
3011 	return seq_open_net(inode, file, &dev_seq_ops,
3012 			    sizeof(struct seq_net_private));
3013 }
3014 
3015 static const struct file_operations dev_seq_fops = {
3016 	.owner	 = THIS_MODULE,
3017 	.open    = dev_seq_open,
3018 	.read    = seq_read,
3019 	.llseek  = seq_lseek,
3020 	.release = seq_release_net,
3021 };
3022 
3023 static const struct seq_operations softnet_seq_ops = {
3024 	.start = softnet_seq_start,
3025 	.next  = softnet_seq_next,
3026 	.stop  = softnet_seq_stop,
3027 	.show  = softnet_seq_show,
3028 };
3029 
3030 static int softnet_seq_open(struct inode *inode, struct file *file)
3031 {
3032 	return seq_open(file, &softnet_seq_ops);
3033 }
3034 
3035 static const struct file_operations softnet_seq_fops = {
3036 	.owner	 = THIS_MODULE,
3037 	.open    = softnet_seq_open,
3038 	.read    = seq_read,
3039 	.llseek  = seq_lseek,
3040 	.release = seq_release,
3041 };
3042 
3043 static void *ptype_get_idx(loff_t pos)
3044 {
3045 	struct packet_type *pt = NULL;
3046 	loff_t i = 0;
3047 	int t;
3048 
3049 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3050 		if (i == pos)
3051 			return pt;
3052 		++i;
3053 	}
3054 
3055 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3056 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3057 			if (i == pos)
3058 				return pt;
3059 			++i;
3060 		}
3061 	}
3062 	return NULL;
3063 }
3064 
3065 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3066 	__acquires(RCU)
3067 {
3068 	rcu_read_lock();
3069 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3070 }
3071 
3072 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3073 {
3074 	struct packet_type *pt;
3075 	struct list_head *nxt;
3076 	int hash;
3077 
3078 	++*pos;
3079 	if (v == SEQ_START_TOKEN)
3080 		return ptype_get_idx(0);
3081 
3082 	pt = v;
3083 	nxt = pt->list.next;
3084 	if (pt->type == htons(ETH_P_ALL)) {
3085 		if (nxt != &ptype_all)
3086 			goto found;
3087 		hash = 0;
3088 		nxt = ptype_base[0].next;
3089 	} else
3090 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3091 
3092 	while (nxt == &ptype_base[hash]) {
3093 		if (++hash >= PTYPE_HASH_SIZE)
3094 			return NULL;
3095 		nxt = ptype_base[hash].next;
3096 	}
3097 found:
3098 	return list_entry(nxt, struct packet_type, list);
3099 }
3100 
3101 static void ptype_seq_stop(struct seq_file *seq, void *v)
3102 	__releases(RCU)
3103 {
3104 	rcu_read_unlock();
3105 }
3106 
3107 static int ptype_seq_show(struct seq_file *seq, void *v)
3108 {
3109 	struct packet_type *pt = v;
3110 
3111 	if (v == SEQ_START_TOKEN)
3112 		seq_puts(seq, "Type Device      Function\n");
3113 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3114 		if (pt->type == htons(ETH_P_ALL))
3115 			seq_puts(seq, "ALL ");
3116 		else
3117 			seq_printf(seq, "%04x", ntohs(pt->type));
3118 
3119 		seq_printf(seq, " %-8s %pF\n",
3120 			   pt->dev ? pt->dev->name : "", pt->func);
3121 	}
3122 
3123 	return 0;
3124 }
3125 
3126 static const struct seq_operations ptype_seq_ops = {
3127 	.start = ptype_seq_start,
3128 	.next  = ptype_seq_next,
3129 	.stop  = ptype_seq_stop,
3130 	.show  = ptype_seq_show,
3131 };
3132 
3133 static int ptype_seq_open(struct inode *inode, struct file *file)
3134 {
3135 	return seq_open_net(inode, file, &ptype_seq_ops,
3136 			sizeof(struct seq_net_private));
3137 }
3138 
3139 static const struct file_operations ptype_seq_fops = {
3140 	.owner	 = THIS_MODULE,
3141 	.open    = ptype_seq_open,
3142 	.read    = seq_read,
3143 	.llseek  = seq_lseek,
3144 	.release = seq_release_net,
3145 };
3146 
3147 
3148 static int __net_init dev_proc_net_init(struct net *net)
3149 {
3150 	int rc = -ENOMEM;
3151 
3152 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3153 		goto out;
3154 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3155 		goto out_dev;
3156 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3157 		goto out_softnet;
3158 
3159 	if (wext_proc_init(net))
3160 		goto out_ptype;
3161 	rc = 0;
3162 out:
3163 	return rc;
3164 out_ptype:
3165 	proc_net_remove(net, "ptype");
3166 out_softnet:
3167 	proc_net_remove(net, "softnet_stat");
3168 out_dev:
3169 	proc_net_remove(net, "dev");
3170 	goto out;
3171 }
3172 
3173 static void __net_exit dev_proc_net_exit(struct net *net)
3174 {
3175 	wext_proc_exit(net);
3176 
3177 	proc_net_remove(net, "ptype");
3178 	proc_net_remove(net, "softnet_stat");
3179 	proc_net_remove(net, "dev");
3180 }
3181 
3182 static struct pernet_operations __net_initdata dev_proc_ops = {
3183 	.init = dev_proc_net_init,
3184 	.exit = dev_proc_net_exit,
3185 };
3186 
3187 static int __init dev_proc_init(void)
3188 {
3189 	return register_pernet_subsys(&dev_proc_ops);
3190 }
3191 #else
3192 #define dev_proc_init() 0
3193 #endif	/* CONFIG_PROC_FS */
3194 
3195 
3196 /**
3197  *	netdev_set_master	-	set up master/slave pair
3198  *	@slave: slave device
3199  *	@master: new master device
3200  *
3201  *	Changes the master device of the slave. Pass %NULL to break the
3202  *	bonding. The caller must hold the RTNL semaphore. On a failure
3203  *	a negative errno code is returned. On success the reference counts
3204  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3205  *	function returns zero.
3206  */
3207 int netdev_set_master(struct net_device *slave, struct net_device *master)
3208 {
3209 	struct net_device *old = slave->master;
3210 
3211 	ASSERT_RTNL();
3212 
3213 	if (master) {
3214 		if (old)
3215 			return -EBUSY;
3216 		dev_hold(master);
3217 	}
3218 
3219 	slave->master = master;
3220 
3221 	synchronize_net();
3222 
3223 	if (old)
3224 		dev_put(old);
3225 
3226 	if (master)
3227 		slave->flags |= IFF_SLAVE;
3228 	else
3229 		slave->flags &= ~IFF_SLAVE;
3230 
3231 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3232 	return 0;
3233 }
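
/*
 * Minimal sketch of the bonding-style call sequence (example_* names are
 * hypothetical, error handling elided); both calls run under rtnl_lock().
 */
#if 0
static int example_enslave(struct net_device *bond_dev,
			   struct net_device *slave_dev)
{
	/* fails with -EBUSY if the slave already has a master */
	return netdev_set_master(slave_dev, bond_dev);
}

static void example_release(struct net_device *slave_dev)
{
	/* passing NULL breaks the pairing again */
	netdev_set_master(slave_dev, NULL);
}
#endif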
3234 
3235 static void dev_change_rx_flags(struct net_device *dev, int flags)
3236 {
3237 	const struct net_device_ops *ops = dev->netdev_ops;
3238 
3239 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3240 		ops->ndo_change_rx_flags(dev, flags);
3241 }
3242 
3243 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3244 {
3245 	unsigned short old_flags = dev->flags;
3246 	uid_t uid;
3247 	gid_t gid;
3248 
3249 	ASSERT_RTNL();
3250 
3251 	dev->flags |= IFF_PROMISC;
3252 	dev->promiscuity += inc;
3253 	if (dev->promiscuity == 0) {
3254 		/*
3255 		 * Avoid overflow.
3256 		 * If inc causes overflow, untouch promisc and return error.
3257 		 */
3258 		if (inc < 0)
3259 			dev->flags &= ~IFF_PROMISC;
3260 		else {
3261 			dev->promiscuity -= inc;
3262 			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
3263 				"set promiscuity failed; the promiscuity feature "
3264 				"of the device might be broken.\n", dev->name);
3265 			return -EOVERFLOW;
3266 		}
3267 	}
3268 	if (dev->flags != old_flags) {
3269 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3270 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3271 							       "left");
3272 		if (audit_enabled) {
3273 			current_uid_gid(&uid, &gid);
3274 			audit_log(current->audit_context, GFP_ATOMIC,
3275 				AUDIT_ANOM_PROMISCUOUS,
3276 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3277 				dev->name, (dev->flags & IFF_PROMISC),
3278 				(old_flags & IFF_PROMISC),
3279 				audit_get_loginuid(current),
3280 				uid, gid,
3281 				audit_get_sessionid(current));
3282 		}
3283 
3284 		dev_change_rx_flags(dev, IFF_PROMISC);
3285 	}
3286 	return 0;
3287 }
3288 
3289 /**
3290  *	dev_set_promiscuity	- update promiscuity count on a device
3291  *	@dev: device
3292  *	@inc: modifier
3293  *
3294  *	Add or remove promiscuity from a device. While the count in the device
3295  *	remains above zero the interface remains promiscuous. Once it hits zero
3296  *	the device reverts back to normal filtering operation. A negative inc
3297  *	value is used to drop promiscuity on the device.
3298  *	Return 0 if successful or a negative errno code on error.
3299  */
3300 int dev_set_promiscuity(struct net_device *dev, int inc)
3301 {
3302 	unsigned short old_flags = dev->flags;
3303 	int err;
3304 
3305 	err = __dev_set_promiscuity(dev, inc);
3306 	if (err < 0)
3307 		return err;
3308 	if (dev->flags != old_flags)
3309 		dev_set_rx_mode(dev);
3310 	return err;
3311 }
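
/*
 * Sketch of the counting usage, as a packet tap might do it (example_* names
 * are hypothetical): promiscuity is a counter, so every +1 must eventually be
 * paired with a -1, and the caller provides the rtnl locking.
 */
#if 0
static int example_tap_attach(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* +1: enter promiscuous mode */
	rtnl_unlock();
	return err;
}

static void example_tap_detach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* -1: drop our reference */
	rtnl_unlock();
}
#endif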
3312 
3313 /**
3314  *	dev_set_allmulti	- update allmulti count on a device
3315  *	@dev: device
3316  *	@inc: modifier
3317  *
3318  *	Add or remove reception of all multicast frames to a device. While the
3319  *	count in the device remains above zero the interface remains listening
3320  *	to all multicast frames. Once it hits zero the device reverts back to normal
3321  *	filtering operation. A negative @inc value is used to drop the counter
3322  *	when releasing a resource needing all multicasts.
3323  *	Return 0 if successful or a negative errno code on error.
3324  */
3325 
3326 int dev_set_allmulti(struct net_device *dev, int inc)
3327 {
3328 	unsigned short old_flags = dev->flags;
3329 
3330 	ASSERT_RTNL();
3331 
3332 	dev->flags |= IFF_ALLMULTI;
3333 	dev->allmulti += inc;
3334 	if (dev->allmulti == 0) {
3335 		/*
3336 		 * Avoid overflow.
3337 		 * If inc causes overflow, untouch allmulti and return error.
3338 		 */
3339 		if (inc < 0)
3340 			dev->flags &= ~IFF_ALLMULTI;
3341 		else {
3342 			dev->allmulti -= inc;
3343 			printk(KERN_WARNING "%s: allmulti counter overflowed, "
3344 				"set allmulti failed; the allmulti feature of "
3345 				"the device might be broken.\n", dev->name);
3346 			return -EOVERFLOW;
3347 		}
3348 	}
3349 	if (dev->flags ^ old_flags) {
3350 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3351 		dev_set_rx_mode(dev);
3352 	}
3353 	return 0;
3354 }
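
/*
 * The allmulti counter is used the same way; for instance, a bonding-style
 * master propagating its state to a new slave might do (sketch only,
 * hypothetical names):
 */
#if 0
static void example_propagate_allmulti(struct net_device *bond_dev,
				       struct net_device *slave_dev)
{
	if (bond_dev->flags & IFF_ALLMULTI)
		dev_set_allmulti(slave_dev, 1);	/* paired with -1 on release */
}
#endif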
3355 
3356 /*
3357  *	Upload unicast and multicast address lists to device and
3358  *	configure RX filtering. When the device doesn't support unicast
3359  *	filtering it is put in promiscuous mode while unicast addresses
3360  *	are present.
3361  */
3362 void __dev_set_rx_mode(struct net_device *dev)
3363 {
3364 	const struct net_device_ops *ops = dev->netdev_ops;
3365 
3366 	/* dev_open will call this function so the list will stay sane. */
3367 	if (!(dev->flags&IFF_UP))
3368 		return;
3369 
3370 	if (!netif_device_present(dev))
3371 		return;
3372 
3373 	if (ops->ndo_set_rx_mode)
3374 		ops->ndo_set_rx_mode(dev);
3375 	else {
3376 		/* Unicast address changes may only happen under the rtnl,
3377 		 * therefore calling __dev_set_promiscuity here is safe.
3378 		 */
3379 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3380 			__dev_set_promiscuity(dev, 1);
3381 			dev->uc_promisc = 1;
3382 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3383 			__dev_set_promiscuity(dev, -1);
3384 			dev->uc_promisc = 0;
3385 		}
3386 
3387 		if (ops->ndo_set_multicast_list)
3388 			ops->ndo_set_multicast_list(dev);
3389 	}
3390 }
3391 
3392 void dev_set_rx_mode(struct net_device *dev)
3393 {
3394 	netif_addr_lock_bh(dev);
3395 	__dev_set_rx_mode(dev);
3396 	netif_addr_unlock_bh(dev);
3397 }
3398 
3399 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3400 		      void *addr, int alen, int glbl)
3401 {
3402 	struct dev_addr_list *da;
3403 
3404 	for (; (da = *list) != NULL; list = &da->next) {
3405 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3406 		    alen == da->da_addrlen) {
3407 			if (glbl) {
3408 				int old_glbl = da->da_gusers;
3409 				da->da_gusers = 0;
3410 				if (old_glbl == 0)
3411 					break;
3412 			}
3413 			if (--da->da_users)
3414 				return 0;
3415 
3416 			*list = da->next;
3417 			kfree(da);
3418 			(*count)--;
3419 			return 0;
3420 		}
3421 	}
3422 	return -ENOENT;
3423 }
3424 
3425 int __dev_addr_add(struct dev_addr_list **list, int *count,
3426 		   void *addr, int alen, int glbl)
3427 {
3428 	struct dev_addr_list *da;
3429 
3430 	for (da = *list; da != NULL; da = da->next) {
3431 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3432 		    da->da_addrlen == alen) {
3433 			if (glbl) {
3434 				int old_glbl = da->da_gusers;
3435 				da->da_gusers = 1;
3436 				if (old_glbl)
3437 					return 0;
3438 			}
3439 			da->da_users++;
3440 			return 0;
3441 		}
3442 	}
3443 
3444 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3445 	if (da == NULL)
3446 		return -ENOMEM;
3447 	memcpy(da->da_addr, addr, alen);
3448 	da->da_addrlen = alen;
3449 	da->da_users = 1;
3450 	da->da_gusers = glbl ? 1 : 0;
3451 	da->next = *list;
3452 	*list = da;
3453 	(*count)++;
3454 	return 0;
3455 }
3456 
3457 /**
3458  *	dev_unicast_delete	- Release secondary unicast address.
3459  *	@dev: device
3460  *	@addr: address to delete
3461  *	@alen: length of @addr
3462  *
3463  *	Release reference to a secondary unicast address and remove it
3464  *	from the device if the reference count drops to zero.
3465  *
3466  * 	The caller must hold the rtnl_mutex.
3467  */
3468 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3469 {
3470 	int err;
3471 
3472 	ASSERT_RTNL();
3473 
3474 	netif_addr_lock_bh(dev);
3475 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3476 	if (!err)
3477 		__dev_set_rx_mode(dev);
3478 	netif_addr_unlock_bh(dev);
3479 	return err;
3480 }
3481 EXPORT_SYMBOL(dev_unicast_delete);
3482 
3483 /**
3484  *	dev_unicast_add		- add a secondary unicast address
3485  *	@dev: device
3486  *	@addr: address to add
3487  *	@alen: length of @addr
3488  *
3489  *	Add a secondary unicast address to the device or increase
3490  *	the reference count if it already exists.
3491  *
3492  *	The caller must hold the rtnl_mutex.
3493  */
3494 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3495 {
3496 	int err;
3497 
3498 	ASSERT_RTNL();
3499 
3500 	netif_addr_lock_bh(dev);
3501 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3502 	if (!err)
3503 		__dev_set_rx_mode(dev);
3504 	netif_addr_unlock_bh(dev);
3505 	return err;
3506 }
3507 EXPORT_SYMBOL(dev_unicast_add);
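
/*
 * Sketch of how a stacked device (macvlan-like; example_* names are
 * hypothetical) uses the secondary unicast list so that its own MAC address
 * is accepted by the lower device; both calls require the rtnl_mutex, and the
 * add is undone with a matching delete.
 */
#if 0
static int example_open(struct net_device *dev, struct net_device *lowerdev)
{
	return dev_unicast_add(lowerdev, dev->dev_addr, ETH_ALEN);
}

static int example_stop(struct net_device *dev, struct net_device *lowerdev)
{
	dev_unicast_delete(lowerdev, dev->dev_addr, ETH_ALEN);
	return 0;
}
#endif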
3508 
3509 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3510 		    struct dev_addr_list **from, int *from_count)
3511 {
3512 	struct dev_addr_list *da, *next;
3513 	int err = 0;
3514 
3515 	da = *from;
3516 	while (da != NULL) {
3517 		next = da->next;
3518 		if (!da->da_synced) {
3519 			err = __dev_addr_add(to, to_count,
3520 					     da->da_addr, da->da_addrlen, 0);
3521 			if (err < 0)
3522 				break;
3523 			da->da_synced = 1;
3524 			da->da_users++;
3525 		} else if (da->da_users == 1) {
3526 			__dev_addr_delete(to, to_count,
3527 					  da->da_addr, da->da_addrlen, 0);
3528 			__dev_addr_delete(from, from_count,
3529 					  da->da_addr, da->da_addrlen, 0);
3530 		}
3531 		da = next;
3532 	}
3533 	return err;
3534 }
3535 
3536 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3537 		       struct dev_addr_list **from, int *from_count)
3538 {
3539 	struct dev_addr_list *da, *next;
3540 
3541 	da = *from;
3542 	while (da != NULL) {
3543 		next = da->next;
3544 		if (da->da_synced) {
3545 			__dev_addr_delete(to, to_count,
3546 					  da->da_addr, da->da_addrlen, 0);
3547 			da->da_synced = 0;
3548 			__dev_addr_delete(from, from_count,
3549 					  da->da_addr, da->da_addrlen, 0);
3550 		}
3551 		da = next;
3552 	}
3553 }
3554 
3555 /**
3556  *	dev_unicast_sync - Synchronize device's unicast list to another device
3557  *	@to: destination device
3558  *	@from: source device
3559  *
3560  *	Add newly added addresses to the destination device and release
3561  *	addresses that have no users left. The source device must be
3562  *	locked by netif_addr_lock_bh.
3563  *
3564  *	This function is intended to be called from the dev->set_rx_mode
3565  *	function of layered software devices.
3566  */
3567 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3568 {
3569 	int err = 0;
3570 
3571 	netif_addr_lock_bh(to);
3572 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3573 			      &from->uc_list, &from->uc_count);
3574 	if (!err)
3575 		__dev_set_rx_mode(to);
3576 	netif_addr_unlock_bh(to);
3577 	return err;
3578 }
3579 EXPORT_SYMBOL(dev_unicast_sync);
3580 
3581 /**
3582  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3583  *	@to: destination device
3584  *	@from: source device
3585  *
3586  *	Remove all addresses that were added to the destination device by
3587  *	dev_unicast_sync(). This function is intended to be called from the
3588  *	dev->stop function of layered software devices.
3589  */
3590 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3591 {
3592 	netif_addr_lock_bh(from);
3593 	netif_addr_lock(to);
3594 
3595 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3596 			  &from->uc_list, &from->uc_count);
3597 	__dev_set_rx_mode(to);
3598 
3599 	netif_addr_unlock(to);
3600 	netif_addr_unlock_bh(from);
3601 }
3602 EXPORT_SYMBOL(dev_unicast_unsync);
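
/*
 * Sketch of the sync/unsync pairing for a layered device such as a VLAN
 * (example_* names are hypothetical): the upper device mirrors its unicast
 * list onto the real device from its rx-mode handler and removes the
 * addresses again when it is stopped.
 */
#if 0
static void example_set_rx_mode(struct net_device *vlan_dev,
				struct net_device *real_dev)
{
	dev_unicast_sync(real_dev, vlan_dev);
}

static void example_do_stop(struct net_device *vlan_dev,
			    struct net_device *real_dev)
{
	dev_unicast_unsync(real_dev, vlan_dev);
}
#endif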
3603 
3604 static void __dev_addr_discard(struct dev_addr_list **list)
3605 {
3606 	struct dev_addr_list *tmp;
3607 
3608 	while (*list != NULL) {
3609 		tmp = *list;
3610 		*list = tmp->next;
3611 		if (tmp->da_users > tmp->da_gusers)
3612 			printk("__dev_addr_discard: address leakage! "
3613 			       "da_users=%d\n", tmp->da_users);
3614 		kfree(tmp);
3615 	}
3616 }
3617 
3618 static void dev_addr_discard(struct net_device *dev)
3619 {
3620 	netif_addr_lock_bh(dev);
3621 
3622 	__dev_addr_discard(&dev->uc_list);
3623 	dev->uc_count = 0;
3624 
3625 	__dev_addr_discard(&dev->mc_list);
3626 	dev->mc_count = 0;
3627 
3628 	netif_addr_unlock_bh(dev);
3629 }
3630 
3631 /**
3632  *	dev_get_flags - get flags reported to userspace
3633  *	@dev: device
3634  *
3635  *	Get the combination of flag bits exported through APIs to userspace.
3636  */
3637 unsigned dev_get_flags(const struct net_device *dev)
3638 {
3639 	unsigned flags;
3640 
3641 	flags = (dev->flags & ~(IFF_PROMISC |
3642 				IFF_ALLMULTI |
3643 				IFF_RUNNING |
3644 				IFF_LOWER_UP |
3645 				IFF_DORMANT)) |
3646 		(dev->gflags & (IFF_PROMISC |
3647 				IFF_ALLMULTI));
3648 
3649 	if (netif_running(dev)) {
3650 		if (netif_oper_up(dev))
3651 			flags |= IFF_RUNNING;
3652 		if (netif_carrier_ok(dev))
3653 			flags |= IFF_LOWER_UP;
3654 		if (netif_dormant(dev))
3655 			flags |= IFF_DORMANT;
3656 	}
3657 
3658 	return flags;
3659 }
3660 
3661 /**
3662  *	dev_change_flags - change device settings
3663  *	@dev: device
3664  *	@flags: device state flags
3665  *
3666  *	Change settings on device based state flags. The flags are
3667  *	in the userspace exported format.
3668  */
3669 int dev_change_flags(struct net_device *dev, unsigned flags)
3670 {
3671 	int ret, changes;
3672 	int old_flags = dev->flags;
3673 
3674 	ASSERT_RTNL();
3675 
3676 	/*
3677 	 *	Set the flags on our device.
3678 	 */
3679 
3680 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3681 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3682 			       IFF_AUTOMEDIA)) |
3683 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3684 				    IFF_ALLMULTI));
3685 
3686 	/*
3687 	 *	Load in the correct multicast list now the flags have changed.
3688 	 */
3689 
3690 	if ((old_flags ^ flags) & IFF_MULTICAST)
3691 		dev_change_rx_flags(dev, IFF_MULTICAST);
3692 
3693 	dev_set_rx_mode(dev);
3694 
3695 	/*
3696 	 *	Have we downed the interface? We handle IFF_UP ourselves
3697 	 *	according to user attempts to set it, rather than blindly
3698 	 *	setting it.
3699 	 */
3700 
3701 	ret = 0;
3702 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3703 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3704 
3705 		if (!ret)
3706 			dev_set_rx_mode(dev);
3707 	}
3708 
3709 	if (dev->flags & IFF_UP &&
3710 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3711 					  IFF_VOLATILE)))
3712 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3713 
3714 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3715 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3716 		dev->gflags ^= IFF_PROMISC;
3717 		dev_set_promiscuity(dev, inc);
3718 	}
3719 
3720 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3721 	   is important. Some (broken) drivers set IFF_PROMISC when
3722 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
3723 	 */
3724 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3725 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3726 		dev->gflags ^= IFF_ALLMULTI;
3727 		dev_set_allmulti(dev, inc);
3728 	}
3729 
3730 	/* Exclude state transition flags, already notified */
3731 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3732 	if (changes)
3733 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3734 
3735 	return ret;
3736 }
3737 
3738 /**
3739  *	dev_set_mtu - Change maximum transfer unit
3740  *	@dev: device
3741  *	@new_mtu: new transfer unit
3742  *
3743  *	Change the maximum transfer size of the network device.
3744  */
3745 int dev_set_mtu(struct net_device *dev, int new_mtu)
3746 {
3747 	const struct net_device_ops *ops = dev->netdev_ops;
3748 	int err;
3749 
3750 	if (new_mtu == dev->mtu)
3751 		return 0;
3752 
3753 	/*	MTU must be positive.	 */
3754 	if (new_mtu < 0)
3755 		return -EINVAL;
3756 
3757 	if (!netif_device_present(dev))
3758 		return -ENODEV;
3759 
3760 	err = 0;
3761 	if (ops->ndo_change_mtu)
3762 		err = ops->ndo_change_mtu(dev, new_mtu);
3763 	else
3764 		dev->mtu = new_mtu;
3765 
3766 	if (!err && dev->flags & IFF_UP)
3767 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3768 	return err;
3769 }
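
/*
 * Sketch of an in-kernel MTU change (hypothetical example_* wrapper): the
 * call is normally made under rtnl_lock(), as the SIOCSIFMTU handler below
 * does, so the change and the NETDEV_CHANGEMTU notification stay consistent
 * with other configuration.
 */
#if 0
static int example_change_mtu(struct net_device *dev, int new_mtu)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, new_mtu);
	rtnl_unlock();
	return err;
}
#endif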
3770 
3771 /**
3772  *	dev_set_mac_address - Change Media Access Control Address
3773  *	@dev: device
3774  *	@sa: new address
3775  *
3776  *	Change the hardware (MAC) address of the device
3777  */
3778 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3779 {
3780 	const struct net_device_ops *ops = dev->netdev_ops;
3781 	int err;
3782 
3783 	if (!ops->ndo_set_mac_address)
3784 		return -EOPNOTSUPP;
3785 	if (sa->sa_family != dev->type)
3786 		return -EINVAL;
3787 	if (!netif_device_present(dev))
3788 		return -ENODEV;
3789 	err = ops->ndo_set_mac_address(dev, sa);
3790 	if (!err)
3791 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3792 	return err;
3793 }
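
/*
 * Sketch of setting a MAC address from inside the kernel (hypothetical
 * example_set_mac and new_mac buffer): the sockaddr family must match
 * dev->type, and the call is made under rtnl_lock(), as the SIOCSIFHWADDR
 * handler below does.
 */
#if 0
static int example_set_mac(struct net_device *dev, const u8 *new_mac)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, new_mac, dev->addr_len);

	return dev_set_mac_address(dev, &sa);
}
#endif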
3794 
3795 /*
3796  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3797  */
3798 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3799 {
3800 	int err;
3801 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3802 
3803 	if (!dev)
3804 		return -ENODEV;
3805 
3806 	switch (cmd) {
3807 		case SIOCGIFFLAGS:	/* Get interface flags */
3808 			ifr->ifr_flags = dev_get_flags(dev);
3809 			return 0;
3810 
3811 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3812 					   (currently unused) */
3813 			ifr->ifr_metric = 0;
3814 			return 0;
3815 
3816 		case SIOCGIFMTU:	/* Get the MTU of a device */
3817 			ifr->ifr_mtu = dev->mtu;
3818 			return 0;
3819 
3820 		case SIOCGIFHWADDR:
3821 			if (!dev->addr_len)
3822 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3823 			else
3824 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3825 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3826 			ifr->ifr_hwaddr.sa_family = dev->type;
3827 			return 0;
3828 
3829 		case SIOCGIFSLAVE:
3830 			err = -EINVAL;
3831 			break;
3832 
3833 		case SIOCGIFMAP:
3834 			ifr->ifr_map.mem_start = dev->mem_start;
3835 			ifr->ifr_map.mem_end   = dev->mem_end;
3836 			ifr->ifr_map.base_addr = dev->base_addr;
3837 			ifr->ifr_map.irq       = dev->irq;
3838 			ifr->ifr_map.dma       = dev->dma;
3839 			ifr->ifr_map.port      = dev->if_port;
3840 			return 0;
3841 
3842 		case SIOCGIFINDEX:
3843 			ifr->ifr_ifindex = dev->ifindex;
3844 			return 0;
3845 
3846 		case SIOCGIFTXQLEN:
3847 			ifr->ifr_qlen = dev->tx_queue_len;
3848 			return 0;
3849 
3850 		default:
3851 			/* dev_ioctl() should ensure this case
3852 			 * is never reached
3853 			 */
3854 			WARN_ON(1);
3855 			err = -EINVAL;
3856 			break;
3857 
3858 	}
3859 	return err;
3860 }
3861 
3862 /*
3863  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3864  */
3865 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3866 {
3867 	int err;
3868 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3869 	const struct net_device_ops *ops;
3870 
3871 	if (!dev)
3872 		return -ENODEV;
3873 
3874 	ops = dev->netdev_ops;
3875 
3876 	switch (cmd) {
3877 		case SIOCSIFFLAGS:	/* Set interface flags */
3878 			return dev_change_flags(dev, ifr->ifr_flags);
3879 
3880 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3881 					   (currently unused) */
3882 			return -EOPNOTSUPP;
3883 
3884 		case SIOCSIFMTU:	/* Set the MTU of a device */
3885 			return dev_set_mtu(dev, ifr->ifr_mtu);
3886 
3887 		case SIOCSIFHWADDR:
3888 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3889 
3890 		case SIOCSIFHWBROADCAST:
3891 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3892 				return -EINVAL;
3893 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3894 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3895 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3896 			return 0;
3897 
3898 		case SIOCSIFMAP:
3899 			if (ops->ndo_set_config) {
3900 				if (!netif_device_present(dev))
3901 					return -ENODEV;
3902 				return ops->ndo_set_config(dev, &ifr->ifr_map);
3903 			}
3904 			return -EOPNOTSUPP;
3905 
3906 		case SIOCADDMULTI:
3907 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3908 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3909 				return -EINVAL;
3910 			if (!netif_device_present(dev))
3911 				return -ENODEV;
3912 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3913 					  dev->addr_len, 1);
3914 
3915 		case SIOCDELMULTI:
3916 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3917 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3918 				return -EINVAL;
3919 			if (!netif_device_present(dev))
3920 				return -ENODEV;
3921 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3922 					     dev->addr_len, 1);
3923 
3924 		case SIOCSIFTXQLEN:
3925 			if (ifr->ifr_qlen < 0)
3926 				return -EINVAL;
3927 			dev->tx_queue_len = ifr->ifr_qlen;
3928 			return 0;
3929 
3930 		case SIOCSIFNAME:
3931 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3932 			return dev_change_name(dev, ifr->ifr_newname);
3933 
3934 		/*
3935 		 *	Unknown or private ioctl
3936 		 */
3937 
3938 		default:
3939 			if ((cmd >= SIOCDEVPRIVATE &&
3940 			    cmd <= SIOCDEVPRIVATE + 15) ||
3941 			    cmd == SIOCBONDENSLAVE ||
3942 			    cmd == SIOCBONDRELEASE ||
3943 			    cmd == SIOCBONDSETHWADDR ||
3944 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3945 			    cmd == SIOCBONDINFOQUERY ||
3946 			    cmd == SIOCBONDCHANGEACTIVE ||
3947 			    cmd == SIOCGMIIPHY ||
3948 			    cmd == SIOCGMIIREG ||
3949 			    cmd == SIOCSMIIREG ||
3950 			    cmd == SIOCBRADDIF ||
3951 			    cmd == SIOCBRDELIF ||
3952 			    cmd == SIOCWANDEV) {
3953 				err = -EOPNOTSUPP;
3954 				if (ops->ndo_do_ioctl) {
3955 					if (netif_device_present(dev))
3956 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3957 					else
3958 						err = -ENODEV;
3959 				}
3960 			} else
3961 				err = -EINVAL;
3962 
3963 	}
3964 	return err;
3965 }
3966 
3967 /*
3968  *	This function handles all "interface"-type I/O control requests. The actual
3969  *	'doing' part of this is dev_ifsioc above.
3970  */
3971 
3972 /**
3973  *	dev_ioctl	-	network device ioctl
3974  *	@net: the applicable net namespace
3975  *	@cmd: command to issue
3976  *	@arg: pointer to a struct ifreq in user space
3977  *
3978  *	Issue ioctl functions to devices. This is normally called by the
3979  *	user space syscall interfaces but can sometimes be useful for
3980  *	other purposes. The return value is the return from the syscall if
3981  *	positive or a negative errno code on error.
3982  */
3983 
3984 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3985 {
3986 	struct ifreq ifr;
3987 	int ret;
3988 	char *colon;
3989 
3990 	/* One special case: SIOCGIFCONF takes ifconf argument
3991 	   and requires shared lock, because it sleeps writing
3992 	   to user space.
3993 	 */
3994 
3995 	if (cmd == SIOCGIFCONF) {
3996 		rtnl_lock();
3997 		ret = dev_ifconf(net, (char __user *) arg);
3998 		rtnl_unlock();
3999 		return ret;
4000 	}
4001 	if (cmd == SIOCGIFNAME)
4002 		return dev_ifname(net, (struct ifreq __user *)arg);
4003 
4004 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4005 		return -EFAULT;
4006 
4007 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4008 
4009 	colon = strchr(ifr.ifr_name, ':');
4010 	if (colon)
4011 		*colon = 0;
4012 
4013 	/*
4014 	 *	See which interface the caller is talking about.
4015 	 */
4016 
4017 	switch (cmd) {
4018 		/*
4019 		 *	These ioctl calls:
4020 		 *	- can be done by all.
4021 		 *	- atomic and do not require locking.
4022 		 *	- return a value
4023 		 */
4024 		case SIOCGIFFLAGS:
4025 		case SIOCGIFMETRIC:
4026 		case SIOCGIFMTU:
4027 		case SIOCGIFHWADDR:
4028 		case SIOCGIFSLAVE:
4029 		case SIOCGIFMAP:
4030 		case SIOCGIFINDEX:
4031 		case SIOCGIFTXQLEN:
4032 			dev_load(net, ifr.ifr_name);
4033 			read_lock(&dev_base_lock);
4034 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4035 			read_unlock(&dev_base_lock);
4036 			if (!ret) {
4037 				if (colon)
4038 					*colon = ':';
4039 				if (copy_to_user(arg, &ifr,
4040 						 sizeof(struct ifreq)))
4041 					ret = -EFAULT;
4042 			}
4043 			return ret;
4044 
4045 		case SIOCETHTOOL:
4046 			dev_load(net, ifr.ifr_name);
4047 			rtnl_lock();
4048 			ret = dev_ethtool(net, &ifr);
4049 			rtnl_unlock();
4050 			if (!ret) {
4051 				if (colon)
4052 					*colon = ':';
4053 				if (copy_to_user(arg, &ifr,
4054 						 sizeof(struct ifreq)))
4055 					ret = -EFAULT;
4056 			}
4057 			return ret;
4058 
4059 		/*
4060 		 *	These ioctl calls:
4061 		 *	- require superuser power.
4062 		 *	- require strict serialization.
4063 		 *	- return a value
4064 		 */
4065 		case SIOCGMIIPHY:
4066 		case SIOCGMIIREG:
4067 		case SIOCSIFNAME:
4068 			if (!capable(CAP_NET_ADMIN))
4069 				return -EPERM;
4070 			dev_load(net, ifr.ifr_name);
4071 			rtnl_lock();
4072 			ret = dev_ifsioc(net, &ifr, cmd);
4073 			rtnl_unlock();
4074 			if (!ret) {
4075 				if (colon)
4076 					*colon = ':';
4077 				if (copy_to_user(arg, &ifr,
4078 						 sizeof(struct ifreq)))
4079 					ret = -EFAULT;
4080 			}
4081 			return ret;
4082 
4083 		/*
4084 		 *	These ioctl calls:
4085 		 *	- require superuser power.
4086 		 *	- require strict serialization.
4087 		 *	- do not return a value
4088 		 */
4089 		case SIOCSIFFLAGS:
4090 		case SIOCSIFMETRIC:
4091 		case SIOCSIFMTU:
4092 		case SIOCSIFMAP:
4093 		case SIOCSIFHWADDR:
4094 		case SIOCSIFSLAVE:
4095 		case SIOCADDMULTI:
4096 		case SIOCDELMULTI:
4097 		case SIOCSIFHWBROADCAST:
4098 		case SIOCSIFTXQLEN:
4099 		case SIOCSMIIREG:
4100 		case SIOCBONDENSLAVE:
4101 		case SIOCBONDRELEASE:
4102 		case SIOCBONDSETHWADDR:
4103 		case SIOCBONDCHANGEACTIVE:
4104 		case SIOCBRADDIF:
4105 		case SIOCBRDELIF:
4106 			if (!capable(CAP_NET_ADMIN))
4107 				return -EPERM;
4108 			/* fall through */
4109 		case SIOCBONDSLAVEINFOQUERY:
4110 		case SIOCBONDINFOQUERY:
4111 			dev_load(net, ifr.ifr_name);
4112 			rtnl_lock();
4113 			ret = dev_ifsioc(net, &ifr, cmd);
4114 			rtnl_unlock();
4115 			return ret;
4116 
4117 		case SIOCGIFMEM:
4118 			/* Get the per device memory space. We can add this but
4119 			 * currently do not support it */
4120 		case SIOCSIFMEM:
4121 			/* Set the per device memory buffer space.
4122 			 * Not applicable in our case */
4123 		case SIOCSIFLINK:
4124 			return -EINVAL;
4125 
4126 		/*
4127 		 *	Unknown or private ioctl.
4128 		 */
4129 		default:
4130 			if (cmd == SIOCWANDEV ||
4131 			    (cmd >= SIOCDEVPRIVATE &&
4132 			     cmd <= SIOCDEVPRIVATE + 15)) {
4133 				dev_load(net, ifr.ifr_name);
4134 				rtnl_lock();
4135 				ret = dev_ifsioc(net, &ifr, cmd);
4136 				rtnl_unlock();
4137 				if (!ret && copy_to_user(arg, &ifr,
4138 							 sizeof(struct ifreq)))
4139 					ret = -EFAULT;
4140 				return ret;
4141 			}
4142 			/* Take care of Wireless Extensions */
4143 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4144 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4145 			return -EINVAL;
4146 	}
4147 }
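
/*
 * Example (illustrative, user-space side): how one of the SIOCGIFxxx
 * branches above is normally reached.  The interface name is made up
 * and error handling is omitted.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFHWADDR, &ifr) == 0)
 *		the hardware address is now in ifr.ifr_hwaddr.sa_data
 */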
4148 
4149 
4150 /**
4151  *	dev_new_index	-	allocate an ifindex
4152  *	@net: the applicable net namespace
4153  *
4154  *	Returns a suitable unique value for a new device interface
4155  *	number.  The caller must hold the rtnl semaphore or the
4156  *	dev_base_lock to be sure it remains unique.
4157  */
4158 static int dev_new_index(struct net *net)
4159 {
4160 	static int ifindex;
4161 	for (;;) {
4162 		if (++ifindex <= 0)
4163 			ifindex = 1;
4164 		if (!__dev_get_by_index(net, ifindex))
4165 			return ifindex;
4166 	}
4167 }
4168 
4169 /* Delayed registration/unregistration */
4170 static LIST_HEAD(net_todo_list);
4171 
4172 static void net_set_todo(struct net_device *dev)
4173 {
4174 	list_add_tail(&dev->todo_list, &net_todo_list);
4175 }
4176 
4177 static void rollback_registered(struct net_device *dev)
4178 {
4179 	BUG_ON(dev_boot_phase);
4180 	ASSERT_RTNL();
4181 
4182 	/* Some devices call this without having registered, as part of initialization unwind. */
4183 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4184 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4185 				  "was registered\n", dev->name, dev);
4186 
4187 		WARN_ON(1);
4188 		return;
4189 	}
4190 
4191 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4192 
4193 	/* If device is running, close it first. */
4194 	dev_close(dev);
4195 
4196 	/* And unlink it from device chain. */
4197 	unlist_netdevice(dev);
4198 
4199 	dev->reg_state = NETREG_UNREGISTERING;
4200 
4201 	synchronize_net();
4202 
4203 	/* Shutdown queueing discipline. */
4204 	dev_shutdown(dev);
4205 
4206 
4207 	/* Notify protocols that we are about to destroy
4208 	   this device. They should clean up all of their state.
4209 	*/
4210 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4211 
4212 	/*
4213 	 *	Flush the unicast and multicast chains
4214 	 */
4215 	dev_addr_discard(dev);
4216 
4217 	if (dev->netdev_ops->ndo_uninit)
4218 		dev->netdev_ops->ndo_uninit(dev);
4219 
4220 	/* Notifier chain MUST detach us from master device. */
4221 	WARN_ON(dev->master);
4222 
4223 	/* Remove entries from kobject tree */
4224 	netdev_unregister_kobject(dev);
4225 
4226 	synchronize_net();
4227 
4228 	dev_put(dev);
4229 }
4230 
4231 static void __netdev_init_queue_locks_one(struct net_device *dev,
4232 					  struct netdev_queue *dev_queue,
4233 					  void *_unused)
4234 {
4235 	spin_lock_init(&dev_queue->_xmit_lock);
4236 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4237 	dev_queue->xmit_lock_owner = -1;
4238 }
4239 
4240 static void netdev_init_queue_locks(struct net_device *dev)
4241 {
4242 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4243 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4244 }
4245 
4246 unsigned long netdev_fix_features(unsigned long features, const char *name)
4247 {
4248 	/* Fix illegal SG+CSUM combinations. */
4249 	if ((features & NETIF_F_SG) &&
4250 	    !(features & NETIF_F_ALL_CSUM)) {
4251 		if (name)
4252 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4253 			       "checksum feature.\n", name);
4254 		features &= ~NETIF_F_SG;
4255 	}
4256 
4257 	/* TSO requires that SG is present as well. */
4258 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4259 		if (name)
4260 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4261 			       "SG feature.\n", name);
4262 		features &= ~NETIF_F_TSO;
4263 	}
4264 
4265 	if (features & NETIF_F_UFO) {
4266 		if (!(features & NETIF_F_GEN_CSUM)) {
4267 			if (name)
4268 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4269 				       "since no NETIF_F_HW_CSUM feature.\n",
4270 				       name);
4271 			features &= ~NETIF_F_UFO;
4272 		}
4273 
4274 		if (!(features & NETIF_F_SG)) {
4275 			if (name)
4276 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4277 				       "since no NETIF_F_SG feature.\n", name);
4278 			features &= ~NETIF_F_UFO;
4279 		}
4280 	}
4281 
4282 	return features;
4283 }
4284 EXPORT_SYMBOL(netdev_fix_features);
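
/*
 * Example (illustrative sketch): a driver sanitising the feature flags
 * it advertises before registration.  With no checksum feature set,
 * NETIF_F_SG is dropped first and NETIF_F_TSO is then dropped because
 * SG is gone.
 *
 *	dev->features = NETIF_F_SG | NETIF_F_TSO;
 *	dev->features = netdev_fix_features(dev->features, dev->name);
 */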
4285 
4286 /* Some devices need to (re-)set their netdev_ops inside
4287  * ->init() or similar.  If that happens, we have to setup
4288  * the compat pointers again.
4289  */
4290 void netdev_resync_ops(struct net_device *dev)
4291 {
4292 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4293 	const struct net_device_ops *ops = dev->netdev_ops;
4294 
4295 	dev->init = ops->ndo_init;
4296 	dev->uninit = ops->ndo_uninit;
4297 	dev->open = ops->ndo_open;
4298 	dev->change_rx_flags = ops->ndo_change_rx_flags;
4299 	dev->set_rx_mode = ops->ndo_set_rx_mode;
4300 	dev->set_multicast_list = ops->ndo_set_multicast_list;
4301 	dev->set_mac_address = ops->ndo_set_mac_address;
4302 	dev->validate_addr = ops->ndo_validate_addr;
4303 	dev->do_ioctl = ops->ndo_do_ioctl;
4304 	dev->set_config = ops->ndo_set_config;
4305 	dev->change_mtu = ops->ndo_change_mtu;
4306 	dev->neigh_setup = ops->ndo_neigh_setup;
4307 	dev->tx_timeout = ops->ndo_tx_timeout;
4308 	dev->get_stats = ops->ndo_get_stats;
4309 	dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4310 	dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4311 	dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4312 #ifdef CONFIG_NET_POLL_CONTROLLER
4313 	dev->poll_controller = ops->ndo_poll_controller;
4314 #endif
4315 #endif
4316 }
4317 EXPORT_SYMBOL(netdev_resync_ops);
4318 
4319 /**
4320  *	register_netdevice	- register a network device
4321  *	@dev: device to register
4322  *
4323  *	Take a completed network device structure and add it to the kernel
4324  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4325  *	chain. 0 is returned on success. A negative errno code is returned
4326  *	on a failure to set up the device, or if the name is a duplicate.
4327  *
4328  *	Callers must hold the rtnl semaphore. You may want
4329  *	register_netdev() instead of this.
4330  *
4331  *	BUGS:
4332  *	The locking appears insufficient to guarantee two parallel registers
4333  *	will not get the same name.
4334  */
4335 
4336 int register_netdevice(struct net_device *dev)
4337 {
4338 	struct hlist_head *head;
4339 	struct hlist_node *p;
4340 	int ret;
4341 	struct net *net = dev_net(dev);
4342 
4343 	BUG_ON(dev_boot_phase);
4344 	ASSERT_RTNL();
4345 
4346 	might_sleep();
4347 
4348 	/* When net_device structures are persistent, this will be fatal. */
4349 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4350 	BUG_ON(!net);
4351 
4352 	spin_lock_init(&dev->addr_list_lock);
4353 	netdev_set_addr_lockdep_class(dev);
4354 	netdev_init_queue_locks(dev);
4355 
4356 	dev->iflink = -1;
4357 
4358 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4359 	/* Netdevice_ops API compatibility support.
4360 	 * This is temporary until all network devices are converted.
4361 	 */
4362 	if (dev->netdev_ops) {
4363 		netdev_resync_ops(dev);
4364 	} else {
4365 		char drivername[64];
4366 		pr_info("%s (%s): not using net_device_ops yet\n",
4367 			dev->name, netdev_drivername(dev, drivername, 64));
4368 
4369 		/* This works only because net_device_ops and the
4370 		   compatibility structure are the same. */
4371 		dev->netdev_ops = (void *) &(dev->init);
4372 	}
4373 #endif
4374 
4375 	/* Init, if this function is available */
4376 	if (dev->netdev_ops->ndo_init) {
4377 		ret = dev->netdev_ops->ndo_init(dev);
4378 		if (ret) {
4379 			if (ret > 0)
4380 				ret = -EIO;
4381 			goto out;
4382 		}
4383 	}
4384 
4385 	if (!dev_valid_name(dev->name)) {
4386 		ret = -EINVAL;
4387 		goto err_uninit;
4388 	}
4389 
4390 	dev->ifindex = dev_new_index(net);
4391 	if (dev->iflink == -1)
4392 		dev->iflink = dev->ifindex;
4393 
4394 	/* Check for existence of name */
4395 	head = dev_name_hash(net, dev->name);
4396 	hlist_for_each(p, head) {
4397 		struct net_device *d
4398 			= hlist_entry(p, struct net_device, name_hlist);
4399 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4400 			ret = -EEXIST;
4401 			goto err_uninit;
4402 		}
4403 	}
4404 
4405 	/* Fix illegal checksum combinations */
4406 	if ((dev->features & NETIF_F_HW_CSUM) &&
4407 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4408 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4409 		       dev->name);
4410 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4411 	}
4412 
4413 	if ((dev->features & NETIF_F_NO_CSUM) &&
4414 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4415 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4416 		       dev->name);
4417 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4418 	}
4419 
4420 	dev->features = netdev_fix_features(dev->features, dev->name);
4421 
4422 	/* Enable software GSO if SG is supported. */
4423 	if (dev->features & NETIF_F_SG)
4424 		dev->features |= NETIF_F_GSO;
4425 
4426 	netdev_initialize_kobject(dev);
4427 	ret = netdev_register_kobject(dev);
4428 	if (ret)
4429 		goto err_uninit;
4430 	dev->reg_state = NETREG_REGISTERED;
4431 
4432 	/*
4433 	 *	Default initial state at registration is that the
4434 	 *	device is present.
4435 	 */
4436 
4437 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4438 
4439 	dev_init_scheduler(dev);
4440 	dev_hold(dev);
4441 	list_netdevice(dev);
4442 
4443 	/* Notify protocols that a new device appeared. */
4444 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4445 	ret = notifier_to_errno(ret);
4446 	if (ret) {
4447 		rollback_registered(dev);
4448 		dev->reg_state = NETREG_UNREGISTERED;
4449 	}
4450 
4451 out:
4452 	return ret;
4453 
4454 err_uninit:
4455 	if (dev->netdev_ops->ndo_uninit)
4456 		dev->netdev_ops->ndo_uninit(dev);
4457 	goto out;
4458 }
4459 
4460 /**
4461  *	init_dummy_netdev	- init a dummy network device for NAPI
4462  *	@dev: device to init
4463  *
4464  *	This takes a network device structure and initializes the minimum
4465  *	number of fields so it can be used to schedule NAPI polls without
4466  *	registering a full blown interface. This is to be used by drivers
4467  *	that need to tie several hardware interfaces to a single NAPI
4468  *	poll scheduler due to HW limitations.
4469  */
4470 int init_dummy_netdev(struct net_device *dev)
4471 {
4472 	/* Clear everything. Note we don't initialize spinlocks
4473 	 * as they aren't supposed to be taken by any of the
4474 	 * NAPI code and this dummy netdev is supposed to be
4475 	 * used only for NAPI polls
4476 	 */
4477 	memset(dev, 0, sizeof(struct net_device));
4478 
4479 	/* make sure we BUG if trying to hit standard
4480 	 * register/unregister code path
4481 	 */
4482 	dev->reg_state = NETREG_DUMMY;
4483 
4484 	/* initialize the ref count */
4485 	atomic_set(&dev->refcnt, 1);
4486 
4487 	/* NAPI wants this */
4488 	INIT_LIST_HEAD(&dev->napi_list);
4489 
4490 	/* a dummy interface is started by default */
4491 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4492 	set_bit(__LINK_STATE_START, &dev->state);
4493 
4494 	return 0;
4495 }
4496 EXPORT_SYMBOL_GPL(init_dummy_netdev);
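
/*
 * Example (illustrative sketch): a driver with several hardware
 * interfaces but a single interrupt can hang its NAPI context off a
 * dummy netdev.  "foo_adapter" and "foo_poll" are made-up names.
 *
 *	struct foo_adapter {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&adapter->napi_dev);
 *	netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll, 64);
 */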
4497 
4498 
4499 /**
4500  *	register_netdev	- register a network device
4501  *	@dev: device to register
4502  *
4503  *	Take a completed network device structure and add it to the kernel
4504  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4505  *	chain. 0 is returned on success. A negative errno code is returned
4506  *	on a failure to set up the device, or if the name is a duplicate.
4507  *
4508  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4509  *	and expands the device name if you passed a format string to
4510  *	alloc_netdev.
4511  */
4512 int register_netdev(struct net_device *dev)
4513 {
4514 	int err;
4515 
4516 	rtnl_lock();
4517 
4518 	/*
4519 	 * If the name is a format string the caller wants us to do a
4520 	 * name allocation.
4521 	 */
4522 	if (strchr(dev->name, '%')) {
4523 		err = dev_alloc_name(dev, dev->name);
4524 		if (err < 0)
4525 			goto out;
4526 	}
4527 
4528 	err = register_netdevice(dev);
4529 out:
4530 	rtnl_unlock();
4531 	return err;
4532 }
4533 EXPORT_SYMBOL(register_netdev);
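
/*
 * Example (illustrative sketch): the usual driver-side registration
 * sequence.  "foo_priv" and "foo_netdev_ops" are made-up names and
 * most error handling is trimmed.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);
 */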
4534 
4535 /*
4536  * netdev_wait_allrefs - wait until all references are gone.
4537  *
4538  * This is called when unregistering network devices.
4539  *
4540  * Any protocol or device that holds a reference should register
4541  * for netdevice notification, and clean up and drop the
4542  * reference when it receives an UNREGISTER event.
4543  * We can get stuck here if buggy protocols don't correctly
4544  * call dev_put.
4545  */
4546 static void netdev_wait_allrefs(struct net_device *dev)
4547 {
4548 	unsigned long rebroadcast_time, warning_time;
4549 
4550 	rebroadcast_time = warning_time = jiffies;
4551 	while (atomic_read(&dev->refcnt) != 0) {
4552 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4553 			rtnl_lock();
4554 
4555 			/* Rebroadcast unregister notification */
4556 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4557 
4558 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4559 				     &dev->state)) {
4560 				/* We must not have linkwatch events
4561 				 * pending on unregister. If this
4562 				 * happens, we simply run the queue
4563 				 * unscheduled, resulting in a noop
4564 				 * for this device.
4565 				 */
4566 				linkwatch_run_queue();
4567 			}
4568 
4569 			__rtnl_unlock();
4570 
4571 			rebroadcast_time = jiffies;
4572 		}
4573 
4574 		msleep(250);
4575 
4576 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4577 			printk(KERN_EMERG "unregister_netdevice: "
4578 			       "waiting for %s to become free. Usage "
4579 			       "count = %d\n",
4580 			       dev->name, atomic_read(&dev->refcnt));
4581 			warning_time = jiffies;
4582 		}
4583 	}
4584 }
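
/*
 * Example (illustrative sketch): the kind of notifier that keeps the
 * loop above from spinning forever.  A subsystem caching a long-lived
 * device reference drops it when it sees NETDEV_UNREGISTER.  The "foo"
 * names are made up.
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UNREGISTER && dev == foo_cached_dev) {
 *			dev_put(dev);
 *			foo_cached_dev = NULL;
 *		}
 *		return NOTIFY_DONE;
 *	}
 */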
4585 
4586 /* The sequence is:
4587  *
4588  *	rtnl_lock();
4589  *	...
4590  *	register_netdevice(x1);
4591  *	register_netdevice(x2);
4592  *	...
4593  *	unregister_netdevice(y1);
4594  *	unregister_netdevice(y2);
4595  *      ...
4596  *	rtnl_unlock();
4597  *	free_netdev(y1);
4598  *	free_netdev(y2);
4599  *
4600  * We are invoked by rtnl_unlock().
4601  * This allows us to deal with problems:
4602  * 1) We can delete sysfs objects which invoke hotplug
4603  *    without deadlocking with linkwatch via keventd.
4604  * 2) Since we run with the RTNL semaphore not held, we can sleep
4605  *    safely in order to wait for the netdev refcnt to drop to zero.
4606  *
4607  * We must not return until all unregister events added during
4608  * the interval the lock was held have been completed.
4609  */
4610 void netdev_run_todo(void)
4611 {
4612 	struct list_head list;
4613 
4614 	/* Snapshot list, allow later requests */
4615 	list_replace_init(&net_todo_list, &list);
4616 
4617 	__rtnl_unlock();
4618 
4619 	while (!list_empty(&list)) {
4620 		struct net_device *dev
4621 			= list_entry(list.next, struct net_device, todo_list);
4622 		list_del(&dev->todo_list);
4623 
4624 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4625 			printk(KERN_ERR "network todo '%s' but state %d\n",
4626 			       dev->name, dev->reg_state);
4627 			dump_stack();
4628 			continue;
4629 		}
4630 
4631 		dev->reg_state = NETREG_UNREGISTERED;
4632 
4633 		on_each_cpu(flush_backlog, dev, 1);
4634 
4635 		netdev_wait_allrefs(dev);
4636 
4637 		/* paranoia */
4638 		BUG_ON(atomic_read(&dev->refcnt));
4639 		WARN_ON(dev->ip_ptr);
4640 		WARN_ON(dev->ip6_ptr);
4641 		WARN_ON(dev->dn_ptr);
4642 
4643 		if (dev->destructor)
4644 			dev->destructor(dev);
4645 
4646 		/* Free network device */
4647 		kobject_put(&dev->dev.kobj);
4648 	}
4649 }
4650 
4651 /**
4652  *	dev_get_stats	- get network device statistics
4653  *	@dev: device to get statistics from
4654  *
4655  *	Get network statistics from device. The device driver may provide
4656  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
4657  *	the internal statistics structure is used.
4658  */
4659 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4660 {
4661 	const struct net_device_ops *ops = dev->netdev_ops;
4662 
4663 	if (ops->ndo_get_stats)
4664 		return ops->ndo_get_stats(dev);
4665 	else
4666 		return &dev->stats;
4667 }
4668 EXPORT_SYMBOL(dev_get_stats);
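
/*
 * Example (illustrative sketch): a driver that mirrors hardware
 * counters into dev->stats via its own ndo_get_stats.  The "foo"
 * helpers are made-up names.
 *
 *	static struct net_device_stats *foo_get_stats(struct net_device *dev)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		dev->stats.rx_packets = foo_read_counter(priv, FOO_RX_PKTS);
 *		return &dev->stats;
 *	}
 */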
4669 
4670 static void netdev_init_one_queue(struct net_device *dev,
4671 				  struct netdev_queue *queue,
4672 				  void *_unused)
4673 {
4674 	queue->dev = dev;
4675 }
4676 
4677 static void netdev_init_queues(struct net_device *dev)
4678 {
4679 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4680 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4681 	spin_lock_init(&dev->tx_global_lock);
4682 }
4683 
4684 /**
4685  *	alloc_netdev_mq - allocate network device
4686  *	@sizeof_priv:	size of private data to allocate space for
4687  *	@name:		device name format string
4688  *	@setup:		callback to initialize device
4689  *	@queue_count:	the number of subqueues to allocate
4690  *
4691  *	Allocates a struct net_device with private data area for driver use
4692  *	and performs basic initialization.  Also allocates subqueue structs
4693  *	for each queue on the device at the end of the netdevice.
4694  */
4695 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4696 		void (*setup)(struct net_device *), unsigned int queue_count)
4697 {
4698 	struct netdev_queue *tx;
4699 	struct net_device *dev;
4700 	size_t alloc_size;
4701 	void *p;
4702 
4703 	BUG_ON(strlen(name) >= sizeof(dev->name));
4704 
4705 	alloc_size = sizeof(struct net_device);
4706 	if (sizeof_priv) {
4707 		/* ensure 32-byte alignment of private area */
4708 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4709 		alloc_size += sizeof_priv;
4710 	}
4711 	/* ensure 32-byte alignment of whole construct */
4712 	alloc_size += NETDEV_ALIGN_CONST;
4713 
4714 	p = kzalloc(alloc_size, GFP_KERNEL);
4715 	if (!p) {
4716 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4717 		return NULL;
4718 	}
4719 
4720 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4721 	if (!tx) {
4722 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4723 		       "tx qdiscs.\n");
4724 		kfree(p);
4725 		return NULL;
4726 	}
4727 
4728 	dev = (struct net_device *)
4729 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4730 	dev->padded = (char *)dev - (char *)p;
4731 	dev_net_set(dev, &init_net);
4732 
4733 	dev->_tx = tx;
4734 	dev->num_tx_queues = queue_count;
4735 	dev->real_num_tx_queues = queue_count;
4736 
4737 	dev->gso_max_size = GSO_MAX_SIZE;
4738 
4739 	netdev_init_queues(dev);
4740 
4741 	INIT_LIST_HEAD(&dev->napi_list);
4742 	setup(dev);
4743 	strcpy(dev->name, name);
4744 	return dev;
4745 }
4746 EXPORT_SYMBOL(alloc_netdev_mq);
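
/*
 * Example (illustrative sketch): allocating an Ethernet-style device
 * with four transmit queues.  ether_setup() is the stock setup helper;
 * "foo_priv" and the "foo%d" name pattern are made up.
 *
 *	dev = alloc_netdev_mq(sizeof(struct foo_priv), "foo%d",
 *			      ether_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 */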
4747 
4748 /**
4749  *	free_netdev - free network device
4750  *	@dev: device
4751  *
4752  *	This function does the last stage of destroying an allocated device
4753  * 	interface. The reference to the device object is released.
4754  *	If this is the last reference then it will be freed.
4755  */
4756 void free_netdev(struct net_device *dev)
4757 {
4758 	struct napi_struct *p, *n;
4759 
4760 	release_net(dev_net(dev));
4761 
4762 	kfree(dev->_tx);
4763 
4764 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4765 		netif_napi_del(p);
4766 
4767 	/*  Compatibility with error handling in drivers */
4768 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4769 		kfree((char *)dev - dev->padded);
4770 		return;
4771 	}
4772 
4773 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4774 	dev->reg_state = NETREG_RELEASED;
4775 
4776 	/* will free via device release */
4777 	put_device(&dev->dev);
4778 }
4779 
4780 /**
4781  *	synchronize_net -  Synchronize with packet receive processing
4782  *
4783  *	Wait for packets currently being received to be done.
4784  *	Does not block later packets from starting.
4785  */
4786 void synchronize_net(void)
4787 {
4788 	might_sleep();
4789 	synchronize_rcu();
4790 }
4791 
4792 /**
4793  *	unregister_netdevice - remove device from the kernel
4794  *	@dev: device
4795  *
4796  *	This function shuts down a device interface and removes it
4797  *	from the kernel tables.
4798  *
4799  *	Callers must hold the rtnl semaphore.  You may want
4800  *	unregister_netdev() instead of this.
4801  */
4802 
4803 void unregister_netdevice(struct net_device *dev)
4804 {
4805 	ASSERT_RTNL();
4806 
4807 	rollback_registered(dev);
4808 	/* Finish processing unregister after unlock */
4809 	net_set_todo(dev);
4810 }
4811 
4812 /**
4813  *	unregister_netdev - remove device from the kernel
4814  *	@dev: device
4815  *
4816  *	This function shuts down a device interface and removes it
4817  *	from the kernel tables.
4818  *
4819  *	This is just a wrapper for unregister_netdevice that takes
4820  *	the rtnl semaphore.  In general you want to use this and not
4821  *	unregister_netdevice.
4822  */
4823 void unregister_netdev(struct net_device *dev)
4824 {
4825 	rtnl_lock();
4826 	unregister_netdevice(dev);
4827 	rtnl_unlock();
4828 }
4829 
4830 EXPORT_SYMBOL(unregister_netdev);
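
/*
 * Example (illustrative sketch): the typical driver remove path,
 * mirroring the registration example earlier.  Only after
 * unregister_netdev() returns may the netdev itself be freed.
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */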
4831 
4832 /**
4833  *	dev_change_net_namespace - move device to a different network namespace
4834  *	@dev: device
4835  *	@net: network namespace
4836  *	@pat: If not NULL name pattern to try if the current device name
4837  *	      is already taken in the destination network namespace.
4838  *
4839  *	This function shuts down a device interface and moves it
4840  *	to a new network namespace. On success 0 is returned, on
4841  *	a failure a negative errno code is returned.
4842  *
4843  *	Callers must hold the rtnl semaphore.
4844  */
4845 
4846 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4847 {
4848 	char buf[IFNAMSIZ];
4849 	const char *destname;
4850 	int err;
4851 
4852 	ASSERT_RTNL();
4853 
4854 	/* Don't allow namespace local devices to be moved. */
4855 	err = -EINVAL;
4856 	if (dev->features & NETIF_F_NETNS_LOCAL)
4857 		goto out;
4858 
4859 #ifdef CONFIG_SYSFS
4860 	/* Don't allow real devices to be moved when sysfs
4861 	 * is enabled.
4862 	 */
4863 	err = -EINVAL;
4864 	if (dev->dev.parent)
4865 		goto out;
4866 #endif
4867 
4868 	/* Ensure the device has been registered */
4869 	err = -EINVAL;
4870 	if (dev->reg_state != NETREG_REGISTERED)
4871 		goto out;
4872 
4873 	/* Get out if there is nothing to do */
4874 	err = 0;
4875 	if (net_eq(dev_net(dev), net))
4876 		goto out;
4877 
4878 	/* Pick the destination device name, and ensure
4879 	 * we can use it in the destination network namespace.
4880 	 */
4881 	err = -EEXIST;
4882 	destname = dev->name;
4883 	if (__dev_get_by_name(net, destname)) {
4884 		/* We get here if we can't use the current device name */
4885 		if (!pat)
4886 			goto out;
4887 		if (!dev_valid_name(pat))
4888 			goto out;
4889 		if (strchr(pat, '%')) {
4890 			if (__dev_alloc_name(net, pat, buf) < 0)
4891 				goto out;
4892 			destname = buf;
4893 		} else
4894 			destname = pat;
4895 		if (__dev_get_by_name(net, destname))
4896 			goto out;
4897 	}
4898 
4899 	/*
4900 	 * And now a mini version of register_netdevice and unregister_netdevice.
4901 	 */
4902 
4903 	/* If the device is running, close it first. */
4904 	dev_close(dev);
4905 
4906 	/* And unlink it from device chain */
4907 	err = -ENODEV;
4908 	unlist_netdevice(dev);
4909 
4910 	synchronize_net();
4911 
4912 	/* Shutdown queueing discipline. */
4913 	dev_shutdown(dev);
4914 
4915 	/* Notify protocols that we are about to destroy
4916 	   this device. They should clean up all of their state.
4917 	*/
4918 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4919 
4920 	/*
4921 	 *	Flush the unicast and multicast chains
4922 	 */
4923 	dev_addr_discard(dev);
4924 
4925 	netdev_unregister_kobject(dev);
4926 
4927 	/* Actually switch the network namespace */
4928 	dev_net_set(dev, net);
4929 
4930 	/* Assign the new device name */
4931 	if (destname != dev->name)
4932 		strcpy(dev->name, destname);
4933 
4934 	/* If there is an ifindex conflict assign a new one */
4935 	if (__dev_get_by_index(net, dev->ifindex)) {
4936 		int iflink = (dev->iflink == dev->ifindex);
4937 		dev->ifindex = dev_new_index(net);
4938 		if (iflink)
4939 			dev->iflink = dev->ifindex;
4940 	}
4941 
4942 	/* Fixup kobjects */
4943 	err = netdev_register_kobject(dev);
4944 	WARN_ON(err);
4945 
4946 	/* Add the device back in the hashes */
4947 	list_netdevice(dev);
4948 
4949 	/* Notify protocols that a new device appeared. */
4950 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4951 
4952 	synchronize_net();
4953 	err = 0;
4954 out:
4955 	return err;
4956 }
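
/*
 * Example (illustrative sketch): moving a device into another
 * namespace with a fallback name pattern, much as rtnetlink does.
 * "newnet" is assumed to be a valid struct net and the caller is
 * assumed to hold the rtnl semaphore.
 *
 *	err = dev_change_net_namespace(dev, newnet, "dev%d");
 */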
4957 
4958 static int dev_cpu_callback(struct notifier_block *nfb,
4959 			    unsigned long action,
4960 			    void *ocpu)
4961 {
4962 	struct sk_buff **list_skb;
4963 	struct Qdisc **list_net;
4964 	struct sk_buff *skb;
4965 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4966 	struct softnet_data *sd, *oldsd;
4967 
4968 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4969 		return NOTIFY_OK;
4970 
4971 	local_irq_disable();
4972 	cpu = smp_processor_id();
4973 	sd = &per_cpu(softnet_data, cpu);
4974 	oldsd = &per_cpu(softnet_data, oldcpu);
4975 
4976 	/* Find end of our completion_queue. */
4977 	list_skb = &sd->completion_queue;
4978 	while (*list_skb)
4979 		list_skb = &(*list_skb)->next;
4980 	/* Append completion queue from offline CPU. */
4981 	*list_skb = oldsd->completion_queue;
4982 	oldsd->completion_queue = NULL;
4983 
4984 	/* Find end of our output_queue. */
4985 	list_net = &sd->output_queue;
4986 	while (*list_net)
4987 		list_net = &(*list_net)->next_sched;
4988 	/* Append output queue from offline CPU. */
4989 	*list_net = oldsd->output_queue;
4990 	oldsd->output_queue = NULL;
4991 
4992 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4993 	local_irq_enable();
4994 
4995 	/* Process offline CPU's input_pkt_queue */
4996 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4997 		netif_rx(skb);
4998 
4999 	return NOTIFY_OK;
5000 }
5001 
5002 
5003 /**
5004  *	netdev_increment_features - increment feature set by one
5005  *	@all: current feature set
5006  *	@one: new feature set
5007  *	@mask: mask feature set
5008  *
5009  *	Computes a new feature set after adding a device with feature set
5010  *	@one to the master device with current feature set @all.  Will not
5011  *	enable anything that is off in @mask. Returns the new feature set.
5012  */
5013 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5014 					unsigned long mask)
5015 {
5016 	/* If device needs checksumming, downgrade to it. */
5017 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5018 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5019 	else if (mask & NETIF_F_ALL_CSUM) {
5020 		/* If one device supports v4/v6 checksumming, set for all. */
5021 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5022 		    !(all & NETIF_F_GEN_CSUM)) {
5023 			all &= ~NETIF_F_ALL_CSUM;
5024 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5025 		}
5026 
5027 		/* If one device supports hw checksumming, set for all. */
5028 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5029 			all &= ~NETIF_F_ALL_CSUM;
5030 			all |= NETIF_F_HW_CSUM;
5031 		}
5032 	}
5033 
5034 	one |= NETIF_F_ALL_CSUM;
5035 
5036 	one |= all & NETIF_F_ONE_FOR_ALL;
5037 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5038 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5039 
5040 	return all;
5041 }
5042 EXPORT_SYMBOL(netdev_increment_features);
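
/*
 * Example (illustrative sketch): how a bonding-style master might
 * recompute its feature set as slaves are added.  The slave iteration
 * is only sketched and the starting feature set is made up.
 *
 *	features = NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_TSO;
 *	for each slave:
 *		features = netdev_increment_features(features,
 *						     slave->features,
 *						     NETIF_F_ONE_FOR_ALL);
 *	bond_dev->features = features;
 */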
5043 
5044 static struct hlist_head *netdev_create_hash(void)
5045 {
5046 	int i;
5047 	struct hlist_head *hash;
5048 
5049 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5050 	if (hash != NULL)
5051 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5052 			INIT_HLIST_HEAD(&hash[i]);
5053 
5054 	return hash;
5055 }
5056 
5057 /* Initialize per network namespace state */
5058 static int __net_init netdev_init(struct net *net)
5059 {
5060 	INIT_LIST_HEAD(&net->dev_base_head);
5061 
5062 	net->dev_name_head = netdev_create_hash();
5063 	if (net->dev_name_head == NULL)
5064 		goto err_name;
5065 
5066 	net->dev_index_head = netdev_create_hash();
5067 	if (net->dev_index_head == NULL)
5068 		goto err_idx;
5069 
5070 	return 0;
5071 
5072 err_idx:
5073 	kfree(net->dev_name_head);
5074 err_name:
5075 	return -ENOMEM;
5076 }
5077 
5078 /**
5079  *	netdev_drivername - network driver for the device
5080  *	@dev: network device
5081  *	@buffer: buffer for resulting name
5082  *	@len: size of buffer
5083  *
5084  *	Determine network driver for device.
5085  */
5086 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5087 {
5088 	const struct device_driver *driver;
5089 	const struct device *parent;
5090 
5091 	if (len <= 0 || !buffer)
5092 		return buffer;
5093 	buffer[0] = 0;
5094 
5095 	parent = dev->dev.parent;
5096 
5097 	if (!parent)
5098 		return buffer;
5099 
5100 	driver = parent->driver;
5101 	if (driver && driver->name)
5102 		strlcpy(buffer, driver->name, len);
5103 	return buffer;
5104 }
5105 
5106 static void __net_exit netdev_exit(struct net *net)
5107 {
5108 	kfree(net->dev_name_head);
5109 	kfree(net->dev_index_head);
5110 }
5111 
5112 static struct pernet_operations __net_initdata netdev_net_ops = {
5113 	.init = netdev_init,
5114 	.exit = netdev_exit,
5115 };
5116 
5117 static void __net_exit default_device_exit(struct net *net)
5118 {
5119 	struct net_device *dev;
5120 	/*
5121 	 * Push all migratable network devices back to the
5122 	 * initial network namespace
5123 	 */
5124 	rtnl_lock();
5125 restart:
5126 	for_each_netdev(net, dev) {
5127 		int err;
5128 		char fb_name[IFNAMSIZ];
5129 
5130 		/* Ignore unmovable devices (e.g. loopback) */
5131 		if (dev->features & NETIF_F_NETNS_LOCAL)
5132 			continue;
5133 
5134 		/* Delete virtual devices */
5135 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5136 			dev->rtnl_link_ops->dellink(dev);
5137 			goto restart;
5138 		}
5139 
5140 		/* Push remaining network devices to init_net */
5141 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5142 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5143 		if (err) {
5144 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5145 				__func__, dev->name, err);
5146 			BUG();
5147 		}
5148 		goto restart;
5149 	}
5150 	rtnl_unlock();
5151 }
5152 
5153 static struct pernet_operations __net_initdata default_device_ops = {
5154 	.exit = default_device_exit,
5155 };
5156 
5157 /*
5158  *	Initialize the DEV module. At boot time this walks the device list and
5159  *	unhooks any devices that fail to initialise (normally hardware not
5160  *	present) and leaves us with a valid list of present and active devices.
5161  *
5162  */
5163 
5164 /*
5165  *       This is called single threaded during boot, so no need
5166  *       to take the rtnl semaphore.
5167  */
5168 static int __init net_dev_init(void)
5169 {
5170 	int i, rc = -ENOMEM;
5171 
5172 	BUG_ON(!dev_boot_phase);
5173 
5174 	if (dev_proc_init())
5175 		goto out;
5176 
5177 	if (netdev_kobject_init())
5178 		goto out;
5179 
5180 	INIT_LIST_HEAD(&ptype_all);
5181 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5182 		INIT_LIST_HEAD(&ptype_base[i]);
5183 
5184 	if (register_pernet_subsys(&netdev_net_ops))
5185 		goto out;
5186 
5187 	/*
5188 	 *	Initialise the packet receive queues.
5189 	 */
5190 
5191 	for_each_possible_cpu(i) {
5192 		struct softnet_data *queue;
5193 
5194 		queue = &per_cpu(softnet_data, i);
5195 		skb_queue_head_init(&queue->input_pkt_queue);
5196 		queue->completion_queue = NULL;
5197 		INIT_LIST_HEAD(&queue->poll_list);
5198 
5199 		queue->backlog.poll = process_backlog;
5200 		queue->backlog.weight = weight_p;
5201 		queue->backlog.gro_list = NULL;
5202 	}
5203 
5204 	dev_boot_phase = 0;
5205 
5206 	/* The loopback device is special: if any other network device
5207 	 * is present in a network namespace, the loopback device must
5208 	 * be present too. Since we now dynamically allocate and free the
5209 	 * loopback device, ensure this invariant is maintained by
5210 	 * keeping the loopback device the first device on the
5211 	 * list of network devices, so that it is the first device that
5212 	 * appears and the last network device that disappears.
5214 	 */
5215 	if (register_pernet_device(&loopback_net_ops))
5216 		goto out;
5217 
5218 	if (register_pernet_device(&default_device_ops))
5219 		goto out;
5220 
5221 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5222 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5223 
5224 	hotcpu_notifier(dev_cpu_callback, 0);
5225 	dst_init();
5226 	dev_mcast_init();
5227 	rc = 0;
5228 out:
5229 	return rc;
5230 }
5231 
5232 subsys_initcall(net_dev_init);
5233 
5234 EXPORT_SYMBOL(__dev_get_by_index);
5235 EXPORT_SYMBOL(__dev_get_by_name);
5236 EXPORT_SYMBOL(__dev_remove_pack);
5237 EXPORT_SYMBOL(dev_valid_name);
5238 EXPORT_SYMBOL(dev_add_pack);
5239 EXPORT_SYMBOL(dev_alloc_name);
5240 EXPORT_SYMBOL(dev_close);
5241 EXPORT_SYMBOL(dev_get_by_flags);
5242 EXPORT_SYMBOL(dev_get_by_index);
5243 EXPORT_SYMBOL(dev_get_by_name);
5244 EXPORT_SYMBOL(dev_open);
5245 EXPORT_SYMBOL(dev_queue_xmit);
5246 EXPORT_SYMBOL(dev_remove_pack);
5247 EXPORT_SYMBOL(dev_set_allmulti);
5248 EXPORT_SYMBOL(dev_set_promiscuity);
5249 EXPORT_SYMBOL(dev_change_flags);
5250 EXPORT_SYMBOL(dev_set_mtu);
5251 EXPORT_SYMBOL(dev_set_mac_address);
5252 EXPORT_SYMBOL(free_netdev);
5253 EXPORT_SYMBOL(netdev_boot_setup_check);
5254 EXPORT_SYMBOL(netdev_set_master);
5255 EXPORT_SYMBOL(netdev_state_change);
5256 EXPORT_SYMBOL(netif_receive_skb);
5257 EXPORT_SYMBOL(netif_rx);
5258 EXPORT_SYMBOL(register_gifconf);
5259 EXPORT_SYMBOL(register_netdevice);
5260 EXPORT_SYMBOL(register_netdevice_notifier);
5261 EXPORT_SYMBOL(skb_checksum_help);
5262 EXPORT_SYMBOL(synchronize_net);
5263 EXPORT_SYMBOL(unregister_netdevice);
5264 EXPORT_SYMBOL(unregister_netdevice_notifier);
5265 EXPORT_SYMBOL(net_enable_timestamp);
5266 EXPORT_SYMBOL(net_disable_timestamp);
5267 EXPORT_SYMBOL(dev_get_flags);
5268 
5269 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5270 EXPORT_SYMBOL(br_handle_frame_hook);
5271 EXPORT_SYMBOL(br_fdb_get_hook);
5272 EXPORT_SYMBOL(br_fdb_put_hook);
5273 #endif
5274 
5275 EXPORT_SYMBOL(dev_load);
5276 
5277 EXPORT_PER_CPU_SYMBOL(softnet_data);
5278