xref: /linux/net/core/dev.c (revision a5c4300389bb33ade2515c082709217f0614cf15)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <linux/if_bridge.h>
105 #include <linux/if_macvlan.h>
106 #include <net/dst.h>
107 #include <net/pkt_sched.h>
108 #include <net/checksum.h>
109 #include <net/xfrm.h>
110 #include <linux/highmem.h>
111 #include <linux/init.h>
112 #include <linux/kmod.h>
113 #include <linux/module.h>
114 #include <linux/netpoll.h>
115 #include <linux/rcupdate.h>
116 #include <linux/delay.h>
117 #include <net/wext.h>
118 #include <net/iw_handler.h>
119 #include <asm/current.h>
120 #include <linux/audit.h>
121 #include <linux/dmaengine.h>
122 #include <linux/err.h>
123 #include <linux/ctype.h>
124 #include <linux/if_arp.h>
125 #include <linux/if_vlan.h>
126 #include <linux/ip.h>
127 #include <net/ip.h>
128 #include <linux/ipv6.h>
129 #include <linux/in.h>
130 #include <linux/jhash.h>
131 #include <linux/random.h>
132 #include <trace/events/napi.h>
133 #include <linux/pci.h>
134 
135 #include "net-sysfs.h"
136 
137 /* Instead of increasing this, you should create a hash table. */
138 #define MAX_GRO_SKBS 8
139 
140 /* This should be increased if a protocol with a bigger head is added. */
141 #define GRO_MAX_HEAD (MAX_HEADER + 128)
142 
143 /*
144  *	The list of packet types we will receive (as opposed to discard)
145  *	and the routines to invoke.
146  *
147  *	Why 16. Because with 16 the only overlap we get on a hash of the
148  *	low nibble of the protocol value is RARP/SNAP/X.25.
149  *
150  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
151  *             sure which should go first, but I bet it won't make much
152  *             difference if we are running VLANs.  The good news is that
153  *             this protocol won't be in the list unless compiled in, so
154  *             the average user (w/out VLANs) will not be adversely affected.
155  *             --BLG
156  *
157  *		0800	IP
158  *		8100    802.1Q VLAN
159  *		0001	802.3
160  *		0002	AX.25
161  *		0004	802.2
162  *		8035	RARP
163  *		0005	SNAP
164  *		0805	X.25
165  *		0806	ARP
166  *		8137	IPX
167  *		0009	Localtalk
168  *		86DD	IPv6
169  */
170 
171 #define PTYPE_HASH_SIZE	(16)
172 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
173 
174 static DEFINE_SPINLOCK(ptype_lock);
175 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
176 static struct list_head ptype_all __read_mostly;	/* Taps */
177 
178 /*
179  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
180  * semaphore.
181  *
182  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
183  *
184  * Writers must hold the rtnl semaphore while they loop through the
185  * dev_base_head list, and hold dev_base_lock for writing when they do the
186  * actual updates.  This allows pure readers to access the list even
187  * while a writer is preparing to update it.
188  *
189  * To put it another way, dev_base_lock is held for writing only to
190  * protect against pure readers; the rtnl semaphore provides the
191  * protection against other writers.
192  *
193  * See, for example usages, register_netdevice() and
194  * unregister_netdevice(), which must be called with the rtnl
195  * semaphore held.
196  */
197 DEFINE_RWLOCK(dev_base_lock);
198 EXPORT_SYMBOL(dev_base_lock);
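
/*
 * Reader sketch (illustrative): given a struct net *net, a pure reader can
 * walk the device list either under read_lock(&dev_base_lock) or, more
 * commonly, under RCU:
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_debug("saw %s\n", dev->name);
 *	rcu_read_unlock();
 *
 * Writers additionally hold the RTNL lock and take dev_base_lock for
 * writing around the actual updates, as list_netdevice() and
 * unlist_netdevice() below do.
 */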
199 
200 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
201 {
202 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
203 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
204 }
205 
206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 {
208 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
209 }
210 
211 static inline void rps_lock(struct softnet_data *sd)
212 {
213 #ifdef CONFIG_RPS
214 	spin_lock(&sd->input_pkt_queue.lock);
215 #endif
216 }
217 
218 static inline void rps_unlock(struct softnet_data *sd)
219 {
220 #ifdef CONFIG_RPS
221 	spin_unlock(&sd->input_pkt_queue.lock);
222 #endif
223 }
224 
225 /* Device list insertion */
226 static int list_netdevice(struct net_device *dev)
227 {
228 	struct net *net = dev_net(dev);
229 
230 	ASSERT_RTNL();
231 
232 	write_lock_bh(&dev_base_lock);
233 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
234 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
235 	hlist_add_head_rcu(&dev->index_hlist,
236 			   dev_index_hash(net, dev->ifindex));
237 	write_unlock_bh(&dev_base_lock);
238 	return 0;
239 }
240 
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 	ASSERT_RTNL();
247 
248 	/* Unlink dev from the device chain */
249 	write_lock_bh(&dev_base_lock);
250 	list_del_rcu(&dev->dev_list);
251 	hlist_del_rcu(&dev->name_hlist);
252 	hlist_del_rcu(&dev->index_hlist);
253 	write_unlock_bh(&dev_base_lock);
254 }
255 
256 /*
257  *	Our notifier list
258  */
259 
260 static RAW_NOTIFIER_HEAD(netdev_chain);
261 
262 /*
263  *	Device drivers call our routines to queue packets here. We empty the
264  *	queue in the local softnet handler.
265  */
266 
267 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
268 EXPORT_PER_CPU_SYMBOL(softnet_data);
269 
270 #ifdef CONFIG_LOCKDEP
271 /*
272  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
273  * according to dev->type
274  */
275 static const unsigned short netdev_lock_type[] =
276 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
277 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
278 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
279 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
280 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
281 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
282 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
283 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
284 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
285 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
286 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
287 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
288 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
289 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
290 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
291 	 ARPHRD_VOID, ARPHRD_NONE};
292 
293 static const char *const netdev_lock_name[] =
294 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
295 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
296 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
297 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
298 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
299 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
300 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
301 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
302 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
303 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
304 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
305 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
306 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
307 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
308 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
309 	 "_xmit_VOID", "_xmit_NONE"};
310 
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 	int i;
317 
318 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 		if (netdev_lock_type[i] == dev_type)
320 			return i;
321 	/* the last key is used by default */
322 	return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324 
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 						 unsigned short dev_type)
327 {
328 	int i;
329 
330 	i = netdev_lock_pos(dev_type);
331 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 				   netdev_lock_name[i]);
333 }
334 
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 	int i;
338 
339 	i = netdev_lock_pos(dev->type);
340 	lockdep_set_class_and_name(&dev->addr_list_lock,
341 				   &netdev_addr_lock_key[i],
342 				   netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 						 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353 
354 /*******************************************************************************
355 
356 		Protocol management and registration routines
357 
358 *******************************************************************************/
359 
360 /*
361  *	Add a protocol ID to the list. Now that the input handler is
362  *	smarter we can dispense with all the messy stuff that used to be
363  *	here.
364  *
365  *	BEWARE!!! Protocol handlers, mangling input packets,
366  *	MUST BE last in hash buckets and checking protocol handlers
367  *	MUST start from promiscuous ptype_all chain in net_bh.
368  *	It is true now, do not change it.
369  *	Explanation follows: if protocol handler, mangling packet, will
370  *	be the first on list, it is not able to sense, that packet
371  *	is cloned and should be copied-on-write, so that it will
372  *	change it and subsequent readers will get broken packet.
373  *							--ANK (980803)
374  */
375 
376 /**
377  *	dev_add_pack - add packet handler
378  *	@pt: packet type declaration
379  *
380  *	Add a protocol handler to the networking stack. The passed &packet_type
381  *	is linked into kernel lists and may not be freed until it has been
382  *	removed from the kernel lists.
383  *
384  *	This call does not sleep, therefore it cannot
385  *	guarantee that all CPUs that are in the middle of receiving
386  *	packets will see the new packet type (until the next received packet).
387  */
388 
389 void dev_add_pack(struct packet_type *pt)
390 {
391 	int hash;
392 
393 	spin_lock_bh(&ptype_lock);
394 	if (pt->type == htons(ETH_P_ALL))
395 		list_add_rcu(&pt->list, &ptype_all);
396 	else {
397 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
398 		list_add_rcu(&pt->list, &ptype_base[hash]);
399 	}
400 	spin_unlock_bh(&ptype_lock);
401 }
402 EXPORT_SYMBOL(dev_add_pack);
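
/*
 * Usage sketch (illustrative): a minimal ETH_P_ALL tap.  The handler
 * my_tap_rcv() and the my_tap instance are hypothetical names.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);
 *
 * The handler sees shared skbs and must clone before modifying them, and
 * my_tap must stay allocated until dev_remove_pack(&my_tap) has returned.
 */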
403 
404 /**
405  *	__dev_remove_pack	 - remove packet handler
406  *	@pt: packet type declaration
407  *
408  *	Remove a protocol handler that was previously added to the kernel
409  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
410  *	from the kernel lists and can be freed or reused once this function
411  *	returns.
412  *
413  *      The packet type might still be in use by receivers
414  *	and must not be freed until after all the CPUs have gone
415  *	through a quiescent state.
416  */
417 void __dev_remove_pack(struct packet_type *pt)
418 {
419 	struct list_head *head;
420 	struct packet_type *pt1;
421 
422 	spin_lock_bh(&ptype_lock);
423 
424 	if (pt->type == htons(ETH_P_ALL))
425 		head = &ptype_all;
426 	else
427 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock_bh(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
461 
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine to
478  *	returns 0 on error and 1 on success.  This is a generic routine for
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  * 	Check boot time settings for the device.
503  *	The found settings are set for the device to be used
504  *	later in the device probing.
505  *	Returns 0 if no settings are found, 1 if they are.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  * 	Check boot time settings for the base address of device.
533  *	The found settings are set for the device to be used
534  *	later in the device probing.
535  *	Returns 0 if no settings found.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves at boot time configured settings for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
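
/*
 * Command-line sketch (illustrative): the parser above takes up to four
 * integers followed by a device name, e.g.
 *
 *	netdev=5,0x340,0xd0000,0xd4000,eth0
 *
 * which records irq=5, base_addr=0x340, mem_start=0xd0000 and
 * mem_end=0xd4000 for "eth0"; netdev_boot_setup_check() later copies these
 * into the net_device when a driver probes an interface with that name.
 */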
586 
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
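
/*
 * Usage sketch (illustrative): given a struct net *net, long-lived access
 * takes a reference that the caller must drop with dev_put() when done:
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *	if (dev) {
 *		pr_debug("%s has ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 *
 * Short read-only access can avoid the refcount by calling
 * dev_get_by_name_rcu() between rcu_read_lock() and rcu_read_unlock() and
 * not using the pointer after the unlock.
 */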
669 
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns a pointer to the device,
676  *	or %NULL if the device is not found. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns a pointer to the device,
702  *	or %NULL if the device is not found. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns a pointer to the device,
728  *	or NULL if the device is not found. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
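
/*
 * Usage sketch (illustrative): resolving an ifindex without taking a
 * reference, valid only inside the RCU read-side section:
 *
 *	struct net_device *dev;
 *	unsigned int mtu = 0;
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		mtu = dev->mtu;
 *	rcu_read_unlock();
 *
 * If the pointer must outlive the RCU section, use dev_get_by_index() and
 * pair it with dev_put().
 */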
745 
746 /**
747  *	dev_getbyhwaddr - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns a pointer to the
753  *	device, or NULL if the device is not found. The caller must hold the
754  *	rtnl semaphore. The returned device has not had its ref count increased
755  *	and the caller must therefore be careful about locking.
756  *
757  *	BUGS:
758  *	If the API was consistent this would be __dev_get_by_hwaddr
759  */
760 
761 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
762 {
763 	struct net_device *dev;
764 
765 	ASSERT_RTNL();
766 
767 	for_each_netdev(net, dev)
768 		if (dev->type == type &&
769 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
770 			return dev;
771 
772 	return NULL;
773 }
774 EXPORT_SYMBOL(dev_getbyhwaddr);
775 
776 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
777 {
778 	struct net_device *dev;
779 
780 	ASSERT_RTNL();
781 	for_each_netdev(net, dev)
782 		if (dev->type == type)
783 			return dev;
784 
785 	return NULL;
786 }
787 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
788 
789 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
790 {
791 	struct net_device *dev, *ret = NULL;
792 
793 	rcu_read_lock();
794 	for_each_netdev_rcu(net, dev)
795 		if (dev->type == type) {
796 			dev_hold(dev);
797 			ret = dev;
798 			break;
799 		}
800 	rcu_read_unlock();
801 	return ret;
802 }
803 EXPORT_SYMBOL(dev_getfirstbyhwtype);
804 
805 /**
806  *	dev_get_by_flags - find any device with given flags
807  *	@net: the applicable net namespace
808  *	@if_flags: IFF_* values
809  *	@mask: bitmask of bits in if_flags to check
810  *
811  *	Search for any interface with the given flags. Returns a pointer to the
812  *	device, or NULL if no matching device is found. The device returned has
813  *	had a reference added and the pointer is safe until the user calls
814  *	dev_put to indicate they have finished with it.
815  */
816 
817 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
818 				    unsigned short mask)
819 {
820 	struct net_device *dev, *ret;
821 
822 	ret = NULL;
823 	rcu_read_lock();
824 	for_each_netdev_rcu(net, dev) {
825 		if (((dev->flags ^ if_flags) & mask) == 0) {
826 			dev_hold(dev);
827 			ret = dev;
828 			break;
829 		}
830 	}
831 	rcu_read_unlock();
832 	return ret;
833 }
834 EXPORT_SYMBOL(dev_get_by_flags);
835 
836 /**
837  *	dev_valid_name - check if name is okay for network device
838  *	@name: name string
839  *
840  *	Network device names need to be valid file names
841  *	to allow sysfs to work.  We also disallow any kind of
842  *	whitespace.
843  */
844 int dev_valid_name(const char *name)
845 {
846 	if (*name == '\0')
847 		return 0;
848 	if (strlen(name) >= IFNAMSIZ)
849 		return 0;
850 	if (!strcmp(name, ".") || !strcmp(name, ".."))
851 		return 0;
852 
853 	while (*name) {
854 		if (*name == '/' || isspace(*name))
855 			return 0;
856 		name++;
857 	}
858 	return 1;
859 }
860 EXPORT_SYMBOL(dev_valid_name);
861 
862 /**
863  *	__dev_alloc_name - allocate a name for a device
864  *	@net: network namespace to allocate the device name in
865  *	@name: name format string
866  *	@buf:  scratch buffer and result name string
867  *
868  *	Passed a format string - eg "lt%d" - it will try to find a suitable
869  *	id. It scans list of devices to build up a free map, then chooses
870  *	the first empty slot. The caller must hold the dev_base or rtnl lock
871  *	while allocating the name and adding the device in order to avoid
872  *	duplicates.
873  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
874  *	Returns the number of the unit assigned or a negative errno code.
875  */
876 
877 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
878 {
879 	int i = 0;
880 	const char *p;
881 	const int max_netdevices = 8*PAGE_SIZE;
882 	unsigned long *inuse;
883 	struct net_device *d;
884 
885 	p = strnchr(name, IFNAMSIZ-1, '%');
886 	if (p) {
887 		/*
888 		 * Verify the string as this thing may have come from
889 		 * the user.  There must be either one "%d" and no other "%"
890 		 * characters.
891 		 */
892 		if (p[1] != 'd' || strchr(p + 2, '%'))
893 			return -EINVAL;
894 
895 		/* Use one page as a bit array of possible slots */
896 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
897 		if (!inuse)
898 			return -ENOMEM;
899 
900 		for_each_netdev(net, d) {
901 			if (!sscanf(d->name, name, &i))
902 				continue;
903 			if (i < 0 || i >= max_netdevices)
904 				continue;
905 
906 			/*  avoid cases where sscanf is not exact inverse of printf */
907 			snprintf(buf, IFNAMSIZ, name, i);
908 			if (!strncmp(buf, d->name, IFNAMSIZ))
909 				set_bit(i, inuse);
910 		}
911 
912 		i = find_first_zero_bit(inuse, max_netdevices);
913 		free_page((unsigned long) inuse);
914 	}
915 
916 	if (buf != name)
917 		snprintf(buf, IFNAMSIZ, name, i);
918 	if (!__dev_get_by_name(net, buf))
919 		return i;
920 
921 	/* It is possible to run out of possible slots
922 	 * when the name is long and there isn't enough space left
923 	 * for the digits, or if all bits are used.
924 	 */
925 	return -ENFILE;
926 }
927 
928 /**
929  *	dev_alloc_name - allocate a name for a device
930  *	@dev: device
931  *	@name: name format string
932  *
933  *	Passed a format string - eg "lt%d" - it will try to find a suitable
934  *	id. It scans list of devices to build up a free map, then chooses
935  *	the first empty slot. The caller must hold the dev_base or rtnl lock
936  *	while allocating the name and adding the device in order to avoid
937  *	duplicates.
938  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
939  *	Returns the number of the unit assigned or a negative errno code.
940  */
941 
942 int dev_alloc_name(struct net_device *dev, const char *name)
943 {
944 	char buf[IFNAMSIZ];
945 	struct net *net;
946 	int ret;
947 
948 	BUG_ON(!dev_net(dev));
949 	net = dev_net(dev);
950 	ret = __dev_alloc_name(net, name, buf);
951 	if (ret >= 0)
952 		strlcpy(dev->name, buf, IFNAMSIZ);
953 	return ret;
954 }
955 EXPORT_SYMBOL(dev_alloc_name);
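
/*
 * Usage sketch (illustrative): a driver that wants the usual ethX naming
 * passes a format string and lets the core pick the first free unit:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success dev->name holds e.g. "eth0" and err is the assigned unit
 * number.  The caller must hold the RTNL lock so the chosen name cannot
 * race with a concurrent registration.
 */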
956 
957 static int dev_get_valid_name(struct net *net, const char *name, char *buf,
958 			      bool fmt)
959 {
960 	if (!dev_valid_name(name))
961 		return -EINVAL;
962 
963 	if (fmt && strchr(name, '%'))
964 		return __dev_alloc_name(net, name, buf);
965 	else if (__dev_get_by_name(net, name))
966 		return -EEXIST;
967 	else if (buf != name)
968 		strlcpy(buf, name, IFNAMSIZ);
969 
970 	return 0;
971 }
972 
973 /**
974  *	dev_change_name - change name of a device
975  *	@dev: device
976  *	@newname: name (or format string) must be at least IFNAMSIZ
977  *
978  *	Change the name of a device; a format string such as "eth%d" can be
979  *	passed for wildcarding.
980  */
981 int dev_change_name(struct net_device *dev, const char *newname)
982 {
983 	char oldname[IFNAMSIZ];
984 	int err = 0;
985 	int ret;
986 	struct net *net;
987 
988 	ASSERT_RTNL();
989 	BUG_ON(!dev_net(dev));
990 
991 	net = dev_net(dev);
992 	if (dev->flags & IFF_UP)
993 		return -EBUSY;
994 
995 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
996 		return 0;
997 
998 	memcpy(oldname, dev->name, IFNAMSIZ);
999 
1000 	err = dev_get_valid_name(net, newname, dev->name, 1);
1001 	if (err < 0)
1002 		return err;
1003 
1004 rollback:
1005 	ret = device_rename(&dev->dev, dev->name);
1006 	if (ret) {
1007 		memcpy(dev->name, oldname, IFNAMSIZ);
1008 		return ret;
1009 	}
1010 
1011 	write_lock_bh(&dev_base_lock);
1012 	hlist_del(&dev->name_hlist);
1013 	write_unlock_bh(&dev_base_lock);
1014 
1015 	synchronize_rcu();
1016 
1017 	write_lock_bh(&dev_base_lock);
1018 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1019 	write_unlock_bh(&dev_base_lock);
1020 
1021 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1022 	ret = notifier_to_errno(ret);
1023 
1024 	if (ret) {
1025 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1026 		if (err >= 0) {
1027 			err = ret;
1028 			memcpy(dev->name, oldname, IFNAMSIZ);
1029 			goto rollback;
1030 		} else {
1031 			printk(KERN_ERR
1032 			       "%s: name change rollback failed: %d.\n",
1033 			       dev->name, ret);
1034 		}
1035 	}
1036 
1037 	return err;
1038 }
1039 
1040 /**
1041  *	dev_set_alias - change ifalias of a device
1042  *	@dev: device
1043  *	@alias: name up to IFALIASZ
1044  *	@len: limit of bytes to copy from info
1045  *
1046  *	Set the ifalias for a device.
1047  */
1048 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1049 {
1050 	ASSERT_RTNL();
1051 
1052 	if (len >= IFALIASZ)
1053 		return -EINVAL;
1054 
1055 	if (!len) {
1056 		if (dev->ifalias) {
1057 			kfree(dev->ifalias);
1058 			dev->ifalias = NULL;
1059 		}
1060 		return 0;
1061 	}
1062 
1063 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1064 	if (!dev->ifalias)
1065 		return -ENOMEM;
1066 
1067 	strlcpy(dev->ifalias, alias, len+1);
1068 	return len;
1069 }
1070 
1071 
1072 /**
1073  *	netdev_features_change - device changes features
1074  *	@dev: device to cause notification
1075  *
1076  *	Called to indicate a device has changed features.
1077  */
1078 void netdev_features_change(struct net_device *dev)
1079 {
1080 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1081 }
1082 EXPORT_SYMBOL(netdev_features_change);
1083 
1084 /**
1085  *	netdev_state_change - device changes state
1086  *	@dev: device to cause notification
1087  *
1088  *	Called to indicate a device has changed state. This function calls
1089  *	the netdev_chain notifier chain and sends a NEWLINK message
1090  *	to the routing socket.
1091  */
1092 void netdev_state_change(struct net_device *dev)
1093 {
1094 	if (dev->flags & IFF_UP) {
1095 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1096 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1097 	}
1098 }
1099 EXPORT_SYMBOL(netdev_state_change);
1100 
1101 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1102 {
1103 	return call_netdevice_notifiers(event, dev);
1104 }
1105 EXPORT_SYMBOL(netdev_bonding_change);
1106 
1107 /**
1108  *	dev_load 	- load a network module
1109  *	@net: the applicable net namespace
1110  *	@name: name of interface
1111  *
1112  *	If a network interface is not present and the process has suitable
1113  *	privileges this function loads the module. If module loading is not
1114  *	available in this kernel then it becomes a nop.
1115  */
1116 
1117 void dev_load(struct net *net, const char *name)
1118 {
1119 	struct net_device *dev;
1120 
1121 	rcu_read_lock();
1122 	dev = dev_get_by_name_rcu(net, name);
1123 	rcu_read_unlock();
1124 
1125 	if (!dev && capable(CAP_NET_ADMIN))
1126 		request_module("%s", name);
1127 }
1128 EXPORT_SYMBOL(dev_load);
1129 
1130 static int __dev_open(struct net_device *dev)
1131 {
1132 	const struct net_device_ops *ops = dev->netdev_ops;
1133 	int ret;
1134 
1135 	ASSERT_RTNL();
1136 
1137 	/*
1138 	 *	Is it even present?
1139 	 */
1140 	if (!netif_device_present(dev))
1141 		return -ENODEV;
1142 
1143 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1144 	ret = notifier_to_errno(ret);
1145 	if (ret)
1146 		return ret;
1147 
1148 	/*
1149 	 *	Call device private open method
1150 	 */
1151 	set_bit(__LINK_STATE_START, &dev->state);
1152 
1153 	if (ops->ndo_validate_addr)
1154 		ret = ops->ndo_validate_addr(dev);
1155 
1156 	if (!ret && ops->ndo_open)
1157 		ret = ops->ndo_open(dev);
1158 
1159 	/*
1160 	 *	If it went open OK then:
1161 	 */
1162 
1163 	if (ret)
1164 		clear_bit(__LINK_STATE_START, &dev->state);
1165 	else {
1166 		/*
1167 		 *	Set the flags.
1168 		 */
1169 		dev->flags |= IFF_UP;
1170 
1171 		/*
1172 		 *	Enable NET_DMA
1173 		 */
1174 		net_dmaengine_get();
1175 
1176 		/*
1177 		 *	Initialize multicasting status
1178 		 */
1179 		dev_set_rx_mode(dev);
1180 
1181 		/*
1182 		 *	Wakeup transmit queue engine
1183 		 */
1184 		dev_activate(dev);
1185 	}
1186 
1187 	return ret;
1188 }
1189 
1190 /**
1191  *	dev_open	- prepare an interface for use.
1192  *	@dev:	device to open
1193  *
1194  *	Takes a device from down to up state. The device's private open
1195  *	function is invoked and then the multicast lists are loaded. Finally
1196  *	the device is moved into the up state and a %NETDEV_UP message is
1197  *	sent to the netdev notifier chain.
1198  *
1199  *	Calling this function on an active interface is a nop. On a failure
1200  *	a negative errno code is returned.
1201  */
1202 int dev_open(struct net_device *dev)
1203 {
1204 	int ret;
1205 
1206 	/*
1207 	 *	Is it already up?
1208 	 */
1209 	if (dev->flags & IFF_UP)
1210 		return 0;
1211 
1212 	/*
1213 	 *	Open device
1214 	 */
1215 	ret = __dev_open(dev);
1216 	if (ret < 0)
1217 		return ret;
1218 
1219 	/*
1220 	 *	... and announce new interface.
1221 	 */
1222 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1223 	call_netdevice_notifiers(NETDEV_UP, dev);
1224 
1225 	return ret;
1226 }
1227 EXPORT_SYMBOL(dev_open);
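
/*
 * Usage sketch (illustrative): bringing an interface up from kernel code
 * mirrors what "ip link set eth0 up" does and must run under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *
 * A zero return means the device was already up or was opened successfully;
 * dev_close() below is the matching teardown path.
 */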
1228 
1229 static int __dev_close(struct net_device *dev)
1230 {
1231 	const struct net_device_ops *ops = dev->netdev_ops;
1232 
1233 	ASSERT_RTNL();
1234 	might_sleep();
1235 
1236 	/*
1237 	 *	Tell people we are going down, so that they can
1238 	 *	prepare for death while the device is still operating.
1239 	 */
1240 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1241 
1242 	clear_bit(__LINK_STATE_START, &dev->state);
1243 
1244 	/* Synchronize to scheduled poll. We cannot touch poll list,
1245 	 * it can be even on different cpu. So just clear netif_running().
1246 	 *
1247 	 * dev->stop() will invoke napi_disable() on all of its
1248 	 * napi_struct instances on this device.
1249 	 */
1250 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1251 
1252 	dev_deactivate(dev);
1253 
1254 	/*
1255 	 *	Call the device specific close. This cannot fail.
1256 	 *	Only if device is UP
1257 	 *
1258 	 *	We allow it to be called even after a DETACH hot-plug
1259 	 *	event.
1260 	 */
1261 	if (ops->ndo_stop)
1262 		ops->ndo_stop(dev);
1263 
1264 	/*
1265 	 *	Device is now down.
1266 	 */
1267 
1268 	dev->flags &= ~IFF_UP;
1269 
1270 	/*
1271 	 *	Shutdown NET_DMA
1272 	 */
1273 	net_dmaengine_put();
1274 
1275 	return 0;
1276 }
1277 
1278 /**
1279  *	dev_close - shutdown an interface.
1280  *	@dev: device to shutdown
1281  *
1282  *	This function moves an active device into down state. A
1283  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1284  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1285  *	chain.
1286  */
1287 int dev_close(struct net_device *dev)
1288 {
1289 	if (!(dev->flags & IFF_UP))
1290 		return 0;
1291 
1292 	__dev_close(dev);
1293 
1294 	/*
1295 	 * Tell people we are down
1296 	 */
1297 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1298 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1299 
1300 	return 0;
1301 }
1302 EXPORT_SYMBOL(dev_close);
1303 
1304 
1305 /**
1306  *	dev_disable_lro - disable Large Receive Offload on a device
1307  *	@dev: device
1308  *
1309  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1310  *	called under RTNL.  This is needed if received packets may be
1311  *	forwarded to another interface.
1312  */
1313 void dev_disable_lro(struct net_device *dev)
1314 {
1315 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1316 	    dev->ethtool_ops->set_flags) {
1317 		u32 flags = dev->ethtool_ops->get_flags(dev);
1318 		if (flags & ETH_FLAG_LRO) {
1319 			flags &= ~ETH_FLAG_LRO;
1320 			dev->ethtool_ops->set_flags(dev, flags);
1321 		}
1322 	}
1323 	WARN_ON(dev->features & NETIF_F_LRO);
1324 }
1325 EXPORT_SYMBOL(dev_disable_lro);
1326 
1327 
1328 static int dev_boot_phase = 1;
1329 
1330 /*
1331  *	Device change register/unregister. These are not inline or static
1332  *	as we export them to the world.
1333  */
1334 
1335 /**
1336  *	register_netdevice_notifier - register a network notifier block
1337  *	@nb: notifier
1338  *
1339  *	Register a notifier to be called when network device events occur.
1340  *	The notifier passed is linked into the kernel structures and must
1341  *	not be reused until it has been unregistered. A negative errno code
1342  *	is returned on a failure.
1343  *
1344  * 	When registered, all registration and up events are replayed
1345  *	to the new notifier to allow it to have a race-free
1346  *	view of the network device list.
1347  */
1348 
1349 int register_netdevice_notifier(struct notifier_block *nb)
1350 {
1351 	struct net_device *dev;
1352 	struct net_device *last;
1353 	struct net *net;
1354 	int err;
1355 
1356 	rtnl_lock();
1357 	err = raw_notifier_chain_register(&netdev_chain, nb);
1358 	if (err)
1359 		goto unlock;
1360 	if (dev_boot_phase)
1361 		goto unlock;
1362 	for_each_net(net) {
1363 		for_each_netdev(net, dev) {
1364 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1365 			err = notifier_to_errno(err);
1366 			if (err)
1367 				goto rollback;
1368 
1369 			if (!(dev->flags & IFF_UP))
1370 				continue;
1371 
1372 			nb->notifier_call(nb, NETDEV_UP, dev);
1373 		}
1374 	}
1375 
1376 unlock:
1377 	rtnl_unlock();
1378 	return err;
1379 
1380 rollback:
1381 	last = dev;
1382 	for_each_net(net) {
1383 		for_each_netdev(net, dev) {
1384 			if (dev == last)
1385 				break;
1386 
1387 			if (dev->flags & IFF_UP) {
1388 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1389 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1390 			}
1391 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1392 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1393 		}
1394 	}
1395 
1396 	raw_notifier_chain_unregister(&netdev_chain, nb);
1397 	goto unlock;
1398 }
1399 EXPORT_SYMBOL(register_netdevice_notifier);
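
/*
 * Usage sketch (illustrative): a minimal notifier that logs interfaces
 * coming up.  The names my_netdev_event and my_netdev_nb are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *
 * Existing devices are replayed as NETDEV_REGISTER/NETDEV_UP events at
 * registration time, so the callback starts from a consistent view.
 */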
1400 
1401 /**
1402  *	unregister_netdevice_notifier - unregister a network notifier block
1403  *	@nb: notifier
1404  *
1405  *	Unregister a notifier previously registered by
1406  *	register_netdevice_notifier(). The notifier is unlinked from the
1407  *	kernel structures and may then be reused. A negative errno code
1408  *	is returned on a failure.
1409  */
1410 
1411 int unregister_netdevice_notifier(struct notifier_block *nb)
1412 {
1413 	int err;
1414 
1415 	rtnl_lock();
1416 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1417 	rtnl_unlock();
1418 	return err;
1419 }
1420 EXPORT_SYMBOL(unregister_netdevice_notifier);
1421 
1422 /**
1423  *	call_netdevice_notifiers - call all network notifier blocks
1424  *      @val: value passed unmodified to notifier function
1425  *      @dev: net_device pointer passed unmodified to notifier function
1426  *
1427  *	Call all network notifier blocks.  Parameters and return value
1428  *	are as for raw_notifier_call_chain().
1429  */
1430 
1431 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1432 {
1433 	ASSERT_RTNL();
1434 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1435 }
1436 
1437 /* When > 0 there are consumers of rx skb time stamps */
1438 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1439 
1440 void net_enable_timestamp(void)
1441 {
1442 	atomic_inc(&netstamp_needed);
1443 }
1444 EXPORT_SYMBOL(net_enable_timestamp);
1445 
1446 void net_disable_timestamp(void)
1447 {
1448 	atomic_dec(&netstamp_needed);
1449 }
1450 EXPORT_SYMBOL(net_disable_timestamp);
1451 
1452 static inline void net_timestamp_set(struct sk_buff *skb)
1453 {
1454 	if (atomic_read(&netstamp_needed))
1455 		__net_timestamp(skb);
1456 	else
1457 		skb->tstamp.tv64 = 0;
1458 }
1459 
1460 static inline void net_timestamp_check(struct sk_buff *skb)
1461 {
1462 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1463 		__net_timestamp(skb);
1464 }
1465 
1466 /**
1467  * dev_forward_skb - loopback an skb to another netif
1468  *
1469  * @dev: destination network device
1470  * @skb: buffer to forward
1471  *
1472  * return values:
1473  *	NET_RX_SUCCESS	(no congestion)
1474  *	NET_RX_DROP     (packet was dropped, but freed)
1475  *
1476  * dev_forward_skb can be used for injecting an skb from the
1477  * start_xmit function of one device into the receive queue
1478  * of another device.
1479  *
1480  * The receiving device may be in another namespace, so
1481  * we have to clear all information in the skb that could
1482  * impact namespace isolation.
1483  */
1484 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1485 {
1486 	skb_orphan(skb);
1487 
1488 	if (!(dev->flags & IFF_UP) ||
1489 	    (skb->len > (dev->mtu + dev->hard_header_len))) {
1490 		kfree_skb(skb);
1491 		return NET_RX_DROP;
1492 	}
1493 	skb_set_dev(skb, dev);
1494 	skb->tstamp.tv64 = 0;
1495 	skb->pkt_type = PACKET_HOST;
1496 	skb->protocol = eth_type_trans(skb, dev);
1497 	return netif_rx(skb);
1498 }
1499 EXPORT_SYMBOL_GPL(dev_forward_skb);
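
/*
 * Usage sketch (illustrative): a veth-style driver hands a transmitted skb
 * straight to its peer's receive path from ndo_start_xmit().  struct
 * my_priv and its peer member are hypothetical driver-private state.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		dev_forward_skb(priv->peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 *
 * dev_forward_skb() either queues the skb via netif_rx() or frees it, so
 * the caller must not touch the skb afterwards.
 */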
1500 
1501 /*
1502  *	Support routine. Sends outgoing frames to any network
1503  *	taps currently in use.
1504  */
1505 
1506 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1507 {
1508 	struct packet_type *ptype;
1509 
1510 #ifdef CONFIG_NET_CLS_ACT
1511 	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1512 		net_timestamp_set(skb);
1513 #else
1514 	net_timestamp_set(skb);
1515 #endif
1516 
1517 	rcu_read_lock();
1518 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1519 		/* Never send packets back to the socket
1520 		 * they originated from - MvS (miquels@drinkel.ow.org)
1521 		 */
1522 		if ((ptype->dev == dev || !ptype->dev) &&
1523 		    (ptype->af_packet_priv == NULL ||
1524 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1525 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1526 			if (!skb2)
1527 				break;
1528 
1529 			/* skb->nh should be correctly
1530 			   set by sender, so that the second statement is
1531 			   just protection against buggy protocols.
1532 			 */
1533 			skb_reset_mac_header(skb2);
1534 
1535 			if (skb_network_header(skb2) < skb2->data ||
1536 			    skb2->network_header > skb2->tail) {
1537 				if (net_ratelimit())
1538 					printk(KERN_CRIT "protocol %04x is "
1539 					       "buggy, dev %s\n",
1540 					       skb2->protocol, dev->name);
1541 				skb_reset_network_header(skb2);
1542 			}
1543 
1544 			skb2->transport_header = skb2->network_header;
1545 			skb2->pkt_type = PACKET_OUTGOING;
1546 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1547 		}
1548 	}
1549 	rcu_read_unlock();
1550 }
1551 
1552 
1553 static inline void __netif_reschedule(struct Qdisc *q)
1554 {
1555 	struct softnet_data *sd;
1556 	unsigned long flags;
1557 
1558 	local_irq_save(flags);
1559 	sd = &__get_cpu_var(softnet_data);
1560 	q->next_sched = NULL;
1561 	*sd->output_queue_tailp = q;
1562 	sd->output_queue_tailp = &q->next_sched;
1563 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1564 	local_irq_restore(flags);
1565 }
1566 
1567 void __netif_schedule(struct Qdisc *q)
1568 {
1569 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1570 		__netif_reschedule(q);
1571 }
1572 EXPORT_SYMBOL(__netif_schedule);
1573 
1574 void dev_kfree_skb_irq(struct sk_buff *skb)
1575 {
1576 	if (atomic_dec_and_test(&skb->users)) {
1577 		struct softnet_data *sd;
1578 		unsigned long flags;
1579 
1580 		local_irq_save(flags);
1581 		sd = &__get_cpu_var(softnet_data);
1582 		skb->next = sd->completion_queue;
1583 		sd->completion_queue = skb;
1584 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1585 		local_irq_restore(flags);
1586 	}
1587 }
1588 EXPORT_SYMBOL(dev_kfree_skb_irq);
1589 
1590 void dev_kfree_skb_any(struct sk_buff *skb)
1591 {
1592 	if (in_irq() || irqs_disabled())
1593 		dev_kfree_skb_irq(skb);
1594 	else
1595 		dev_kfree_skb(skb);
1596 }
1597 EXPORT_SYMBOL(dev_kfree_skb_any);
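
/*
 * Usage sketch (illustrative): a TX-completion handler that may run both
 * from hard IRQ context and from a process-context shutdown path frees
 * skbs with the context-agnostic helper:
 *
 *	dev_kfree_skb_any(skb);
 *
 * Code known to run only in hard interrupt context can call
 * dev_kfree_skb_irq() directly; plain dev_kfree_skb() is reserved for
 * process and softirq context.
 */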
1598 
1599 
1600 /**
1601  * netif_device_detach - mark device as removed
1602  * @dev: network device
1603  *
1604  * Mark the device as removed from the system and therefore no longer available.
1605  */
1606 void netif_device_detach(struct net_device *dev)
1607 {
1608 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1609 	    netif_running(dev)) {
1610 		netif_tx_stop_all_queues(dev);
1611 	}
1612 }
1613 EXPORT_SYMBOL(netif_device_detach);
1614 
1615 /**
1616  * netif_device_attach - mark device as attached
1617  * @dev: network device
1618  *
1619  * Mark the device as attached to the system and restart it if needed.
1620  */
1621 void netif_device_attach(struct net_device *dev)
1622 {
1623 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1624 	    netif_running(dev)) {
1625 		netif_tx_wake_all_queues(dev);
1626 		__netdev_watchdog_up(dev);
1627 	}
1628 }
1629 EXPORT_SYMBOL(netif_device_attach);
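
/*
 * Usage sketch (illustrative): a PCI network driver typically pairs these
 * calls in its suspend/resume hooks.  my_suspend() and my_resume() are
 * hypothetical names.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 *
 * Detaching stops all TX queues while the hardware is unavailable; attaching
 * restarts them and rearms the watchdog if the interface was running.
 */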
1630 
1631 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1632 {
1633 	return ((features & NETIF_F_GEN_CSUM) ||
1634 		((features & NETIF_F_IP_CSUM) &&
1635 		 protocol == htons(ETH_P_IP)) ||
1636 		((features & NETIF_F_IPV6_CSUM) &&
1637 		 protocol == htons(ETH_P_IPV6)) ||
1638 		((features & NETIF_F_FCOE_CRC) &&
1639 		 protocol == htons(ETH_P_FCOE)));
1640 }
1641 
1642 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1643 {
1644 	if (can_checksum_protocol(dev->features, skb->protocol))
1645 		return true;
1646 
1647 	if (skb->protocol == htons(ETH_P_8021Q)) {
1648 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1649 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1650 					  veh->h_vlan_encapsulated_proto))
1651 			return true;
1652 	}
1653 
1654 	return false;
1655 }
1656 
1657 /**
1658  * skb_set_dev - assign a new device to a buffer
1659  * @skb: buffer for the new device
1660  * @dev: network device
1661  *
1662  * If an skb is owned by a device already, we have to reset
1663  * all data private to the namespace the device belongs to
1664  * before assigning it a new device.
1665  */
1666 #ifdef CONFIG_NET_NS
1667 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1668 {
1669 	skb_dst_drop(skb);
1670 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1671 		secpath_reset(skb);
1672 		nf_reset(skb);
1673 		skb_init_secmark(skb);
1674 		skb->mark = 0;
1675 		skb->priority = 0;
1676 		skb->nf_trace = 0;
1677 		skb->ipvs_property = 0;
1678 #ifdef CONFIG_NET_SCHED
1679 		skb->tc_index = 0;
1680 #endif
1681 	}
1682 	skb->dev = dev;
1683 }
1684 EXPORT_SYMBOL(skb_set_dev);
1685 #endif /* CONFIG_NET_NS */
1686 
1687 /*
1688  * Invalidate hardware checksum when packet is to be mangled, and
1689  * complete checksum manually on outgoing path.
1690  */
1691 int skb_checksum_help(struct sk_buff *skb)
1692 {
1693 	__wsum csum;
1694 	int ret = 0, offset;
1695 
1696 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1697 		goto out_set_summed;
1698 
1699 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1700 		/* Let GSO fix up the checksum. */
1701 		goto out_set_summed;
1702 	}
1703 
1704 	offset = skb->csum_start - skb_headroom(skb);
1705 	BUG_ON(offset >= skb_headlen(skb));
1706 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1707 
1708 	offset += skb->csum_offset;
1709 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1710 
1711 	if (skb_cloned(skb) &&
1712 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1713 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1714 		if (ret)
1715 			goto out;
1716 	}
1717 
1718 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1719 out_set_summed:
1720 	skb->ip_summed = CHECKSUM_NONE;
1721 out:
1722 	return ret;
1723 }
1724 EXPORT_SYMBOL(skb_checksum_help);
1725 
1726 /**
1727  *	skb_gso_segment - Perform segmentation on skb.
1728  *	@skb: buffer to segment
1729  *	@features: features for the output path (see dev->features)
1730  *
1731  *	This function segments the given skb and returns a list of segments.
1732  *
1733  *	It may return NULL if the skb requires no segmentation.  This is
1734  *	only possible when GSO is used for verifying header integrity.
1735  */
1736 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1737 {
1738 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1739 	struct packet_type *ptype;
1740 	__be16 type = skb->protocol;
1741 	int err;
1742 
1743 	skb_reset_mac_header(skb);
1744 	skb->mac_len = skb->network_header - skb->mac_header;
1745 	__skb_pull(skb, skb->mac_len);
1746 
1747 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1748 		struct net_device *dev = skb->dev;
1749 		struct ethtool_drvinfo info = {};
1750 
1751 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1752 			dev->ethtool_ops->get_drvinfo(dev, &info);
1753 
1754 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1755 			"ip_summed=%d",
1756 		     info.driver, dev ? dev->features : 0L,
1757 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1758 		     skb->len, skb->data_len, skb->ip_summed);
1759 
1760 		if (skb_header_cloned(skb) &&
1761 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1762 			return ERR_PTR(err);
1763 	}
1764 
1765 	rcu_read_lock();
1766 	list_for_each_entry_rcu(ptype,
1767 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1768 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1769 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1770 				err = ptype->gso_send_check(skb);
1771 				segs = ERR_PTR(err);
1772 				if (err || skb_gso_ok(skb, features))
1773 					break;
1774 				__skb_push(skb, (skb->data -
1775 						 skb_network_header(skb)));
1776 			}
1777 			segs = ptype->gso_segment(skb, features);
1778 			break;
1779 		}
1780 	}
1781 	rcu_read_unlock();
1782 
1783 	__skb_push(skb, skb->data - skb_mac_header(skb));
1784 
1785 	return segs;
1786 }
1787 EXPORT_SYMBOL(skb_gso_segment);
1788 
1789 /* Take action when hardware reception checksum errors are detected. */
1790 #ifdef CONFIG_BUG
1791 void netdev_rx_csum_fault(struct net_device *dev)
1792 {
1793 	if (net_ratelimit()) {
1794 		printk(KERN_ERR "%s: hw csum failure.\n",
1795 			dev ? dev->name : "<unknown>");
1796 		dump_stack();
1797 	}
1798 }
1799 EXPORT_SYMBOL(netdev_rx_csum_fault);
1800 #endif
1801 
1802 /* Actually, we should eliminate this check as soon as we know that:
1803  * 1. IOMMU is present and allows to map all the memory.
1804  * 2. No high memory really exists on this machine.
1805  */
1806 
1807 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1808 {
1809 #ifdef CONFIG_HIGHMEM
1810 	int i;
1811 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1812 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1813 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1814 				return 1;
1815 	}
1816 
1817 	if (PCI_DMA_BUS_IS_PHYS) {
1818 		struct device *pdev = dev->dev.parent;
1819 
1820 		if (!pdev)
1821 			return 0;
1822 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1823 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1824 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1825 				return 1;
1826 		}
1827 	}
1828 #endif
1829 	return 0;
1830 }
1831 
1832 struct dev_gso_cb {
1833 	void (*destructor)(struct sk_buff *skb);
1834 };
1835 
1836 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1837 
1838 static void dev_gso_skb_destructor(struct sk_buff *skb)
1839 {
1840 	struct dev_gso_cb *cb;
1841 
1842 	do {
1843 		struct sk_buff *nskb = skb->next;
1844 
1845 		skb->next = nskb->next;
1846 		nskb->next = NULL;
1847 		kfree_skb(nskb);
1848 	} while (skb->next);
1849 
1850 	cb = DEV_GSO_CB(skb);
1851 	if (cb->destructor)
1852 		cb->destructor(skb);
1853 }
1854 
1855 /**
1856  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1857  *	@skb: buffer to segment
1858  *
1859  *	This function segments the given skb and stores the list of segments
1860  *	in skb->next.
1861  */
1862 static int dev_gso_segment(struct sk_buff *skb)
1863 {
1864 	struct net_device *dev = skb->dev;
1865 	struct sk_buff *segs;
1866 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1867 					 NETIF_F_SG : 0);
1868 
1869 	segs = skb_gso_segment(skb, features);
1870 
1871 	/* Verifying header integrity only. */
1872 	if (!segs)
1873 		return 0;
1874 
1875 	if (IS_ERR(segs))
1876 		return PTR_ERR(segs);
1877 
1878 	skb->next = segs;
1879 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1880 	skb->destructor = dev_gso_skb_destructor;
1881 
1882 	return 0;
1883 }
1884 
1885 /*
1886  * Try to orphan skb early, right before transmission by the device.
1887  * We cannot orphan skb if tx timestamp is requested, since
1888  * drivers need to call skb_tstamp_tx() to send the timestamp.
1889  */
1890 static inline void skb_orphan_try(struct sk_buff *skb)
1891 {
1892 	if (!skb_tx(skb)->flags)
1893 		skb_orphan(skb);
1894 }
1895 
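/*
 * dev_hard_start_xmit - hand one skb (or the GSO segment list built by
 * dev_gso_segment) to the driver's ndo_start_xmit().  On a partial failure
 * of a segment list, the remaining segments are left chained on skb->next
 * so the caller can requeue them.
 */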
1896 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1897 			struct netdev_queue *txq)
1898 {
1899 	const struct net_device_ops *ops = dev->netdev_ops;
1900 	int rc = NETDEV_TX_OK;
1901 
1902 	if (likely(!skb->next)) {
1903 		if (!list_empty(&ptype_all))
1904 			dev_queue_xmit_nit(skb, dev);
1905 
1906 		/*
1907 		 * If the device doesn't need skb->dst, release it right now while
1908 		 * it's hot in this cpu's cache.
1909 		 */
1910 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1911 			skb_dst_drop(skb);
1912 
1913 		skb_orphan_try(skb);
1914 
1915 		if (netif_needs_gso(dev, skb)) {
1916 			if (unlikely(dev_gso_segment(skb)))
1917 				goto out_kfree_skb;
1918 			if (skb->next)
1919 				goto gso;
1920 		}
1921 
1922 		rc = ops->ndo_start_xmit(skb, dev);
1923 		if (rc == NETDEV_TX_OK)
1924 			txq_trans_update(txq);
1925 		return rc;
1926 	}
1927 
1928 gso:
1929 	do {
1930 		struct sk_buff *nskb = skb->next;
1931 
1932 		skb->next = nskb->next;
1933 		nskb->next = NULL;
1934 
1935 		/*
1936 		 * If the device doesn't need nskb->dst, release it right now while
1937 		 * it's hot in this cpu's cache.
1938 		 */
1939 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1940 			skb_dst_drop(nskb);
1941 
1942 		rc = ops->ndo_start_xmit(nskb, dev);
1943 		if (unlikely(rc != NETDEV_TX_OK)) {
1944 			if (rc & ~NETDEV_TX_MASK)
1945 				goto out_kfree_gso_skb;
1946 			nskb->next = skb->next;
1947 			skb->next = nskb;
1948 			return rc;
1949 		}
1950 		txq_trans_update(txq);
1951 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1952 			return NETDEV_TX_BUSY;
1953 	} while (skb->next);
1954 
1955 out_kfree_gso_skb:
1956 	if (likely(skb->next == NULL))
1957 		skb->destructor = DEV_GSO_CB(skb)->destructor;
1958 out_kfree_skb:
1959 	kfree_skb(skb);
1960 	return rc;
1961 }
1962 
1963 static u32 hashrnd __read_mostly;
1964 
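/*
 * skb_tx_hash - pick a tx queue index for a multiqueue device.  Prefer the
 * recorded rx queue (folded into range), then the socket hash, and finally
 * the protocol value; the result is scaled onto [0, real_num_tx_queues).
 */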
1965 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1966 {
1967 	u32 hash;
1968 
1969 	if (skb_rx_queue_recorded(skb)) {
1970 		hash = skb_get_rx_queue(skb);
1971 		while (unlikely(hash >= dev->real_num_tx_queues))
1972 			hash -= dev->real_num_tx_queues;
1973 		return hash;
1974 	}
1975 
1976 	if (skb->sk && skb->sk->sk_hash)
1977 		hash = skb->sk->sk_hash;
1978 	else
1979 		hash = (__force u16) skb->protocol;
1980 
1981 	hash = jhash_1word(hash, hashrnd);
1982 
1983 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1984 }
1985 EXPORT_SYMBOL(skb_tx_hash);
1986 
1987 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1988 {
1989 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1990 		if (net_ratelimit()) {
1991 			pr_warning("%s selects TX queue %d, but "
1992 				"real number of TX queues is %d\n",
1993 				dev->name, queue_index, dev->real_num_tx_queues);
1994 		}
1995 		return 0;
1996 	}
1997 	return queue_index;
1998 }
1999 
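/*
 * dev_pick_tx - select the tx queue for this skb: use the queue cached in
 * the socket if valid, else ask the driver's ndo_select_queue(), else fall
 * back to skb_tx_hash().  The choice is stored in skb->queue_mapping.
 */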
2000 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2001 					struct sk_buff *skb)
2002 {
2003 	u16 queue_index;
2004 	struct sock *sk = skb->sk;
2005 
2006 	if (sk_tx_queue_recorded(sk)) {
2007 		queue_index = sk_tx_queue_get(sk);
2008 	} else {
2009 		const struct net_device_ops *ops = dev->netdev_ops;
2010 
2011 		if (ops->ndo_select_queue) {
2012 			queue_index = ops->ndo_select_queue(dev, skb);
2013 			queue_index = dev_cap_txqueue(dev, queue_index);
2014 		} else {
2015 			queue_index = 0;
2016 			if (dev->real_num_tx_queues > 1)
2017 				queue_index = skb_tx_hash(dev, skb);
2018 
2019 			if (sk) {
2020 				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2021 
2022 				if (dst && skb_dst(skb) == dst)
2023 					sk_tx_queue_set(sk, queue_index);
2024 			}
2025 		}
2026 	}
2027 
2028 	skb_set_queue_mapping(skb, queue_index);
2029 	return netdev_get_tx_queue(dev, queue_index);
2030 }
2031 
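
/*
 * __dev_xmit_skb - enqueue the skb to the qdisc, or, if the qdisc is empty
 * and allows bypass (TCQ_F_CAN_BYPASS), transmit it directly while holding
 * the qdisc root lock.
 */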
2032 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2033 				 struct net_device *dev,
2034 				 struct netdev_queue *txq)
2035 {
2036 	spinlock_t *root_lock = qdisc_lock(q);
2037 	int rc;
2038 
2039 	spin_lock(root_lock);
2040 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2041 		kfree_skb(skb);
2042 		rc = NET_XMIT_DROP;
2043 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2044 		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2045 		/*
2046 		 * This is a work-conserving queue; there are no old skbs
2047 		 * waiting to be sent out; and the qdisc is not running -
2048 		 * xmit the skb directly.
2049 		 */
2050 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2051 			skb_dst_force(skb);
2052 		__qdisc_update_bstats(q, skb->len);
2053 		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2054 			__qdisc_run(q);
2055 		else
2056 			clear_bit(__QDISC_STATE_RUNNING, &q->state);
2057 
2058 		rc = NET_XMIT_SUCCESS;
2059 	} else {
2060 		skb_dst_force(skb);
2061 		rc = qdisc_enqueue_root(skb, q);
2062 		qdisc_run(q);
2063 	}
2064 	spin_unlock(root_lock);
2065 
2066 	return rc;
2067 }
2068 
2069 /*
2070  * Returns true if either:
2071  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2072  *	2. skb is fragmented and the device does not support SG, or if
2073  *	   at least one of fragments is in highmem and device does not
2074  *	   support DMA from it.
2075  */
2076 static inline int skb_needs_linearize(struct sk_buff *skb,
2077 				      struct net_device *dev)
2078 {
2079 	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2080 	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2081 					      illegal_highdma(dev, skb)));
2082 }
2083 
2084 /**
2085  *	dev_queue_xmit - transmit a buffer
2086  *	@skb: buffer to transmit
2087  *
2088  *	Queue a buffer for transmission to a network device. The caller must
2089  *	have set the device and priority and built the buffer before calling
2090  *	this function. The function can be called from an interrupt.
2091  *
2092  *	A negative errno code is returned on a failure. A success does not
2093  *	guarantee the frame will be transmitted as it may be dropped due
2094  *	to congestion or traffic shaping.
2095  *
2096  * -----------------------------------------------------------------------------------
2097  *      I notice this method can also return errors from the queue disciplines,
2098  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2099  *      be positive.
2100  *
2101  *      Regardless of the return value, the skb is consumed, so it is currently
2102  *      difficult to retry a send to this method.  (You can bump the ref count
2103  *      before sending to hold a reference for retry if you are careful.)
2104  *
2105  *      When calling this method, interrupts MUST be enabled.  This is because
2106  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2107  *          --BLG
2108  */
2109 int dev_queue_xmit(struct sk_buff *skb)
2110 {
2111 	struct net_device *dev = skb->dev;
2112 	struct netdev_queue *txq;
2113 	struct Qdisc *q;
2114 	int rc = -ENOMEM;
2115 
2116 	/* GSO will handle the following emulations directly. */
2117 	if (netif_needs_gso(dev, skb))
2118 		goto gso;
2119 
2120 	/* Convert a paged skb to linear, if required */
2121 	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
2122 		goto out_kfree_skb;
2123 
2124 	/* If packet is not checksummed and device does not support
2125 	 * checksumming for this protocol, complete checksumming here.
2126 	 */
2127 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
2128 		skb_set_transport_header(skb, skb->csum_start -
2129 					      skb_headroom(skb));
2130 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2131 			goto out_kfree_skb;
2132 	}
2133 
2134 gso:
2135 	/* Disable soft irqs for various locks below. Also
2136 	 * stops preemption for RCU.
2137 	 */
2138 	rcu_read_lock_bh();
2139 
2140 	txq = dev_pick_tx(dev, skb);
2141 	q = rcu_dereference_bh(txq->qdisc);
2142 
2143 #ifdef CONFIG_NET_CLS_ACT
2144 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2145 #endif
2146 	if (q->enqueue) {
2147 		rc = __dev_xmit_skb(skb, q, dev, txq);
2148 		goto out;
2149 	}
2150 
2151 	/* The device has no queue. Common case for software devices:
2152 	   loopback, all the sorts of tunnels...
2153 
2154 	   Really, it is unlikely that netif_tx_lock protection is necessary
2155 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2156 	   counters.)
2157 	   However, it is possible that they rely on the protection
2158 	   made by us here.
2159 
2160 	   Check this and take the lock. It is not prone to deadlocks.
2161 	   Or shoot the noqueue qdisc instead; it is even simpler 8)
2162 	 */
2163 	if (dev->flags & IFF_UP) {
2164 		int cpu = smp_processor_id(); /* ok because BHs are off */
2165 
2166 		if (txq->xmit_lock_owner != cpu) {
2167 
2168 			HARD_TX_LOCK(dev, txq, cpu);
2169 
2170 			if (!netif_tx_queue_stopped(txq)) {
2171 				rc = dev_hard_start_xmit(skb, dev, txq);
2172 				if (dev_xmit_complete(rc)) {
2173 					HARD_TX_UNLOCK(dev, txq);
2174 					goto out;
2175 				}
2176 			}
2177 			HARD_TX_UNLOCK(dev, txq);
2178 			if (net_ratelimit())
2179 				printk(KERN_CRIT "Virtual device %s asks to "
2180 				       "queue packet!\n", dev->name);
2181 		} else {
2182 			/* Recursion is detected! It is possible,
2183 			 * unfortunately */
2184 			if (net_ratelimit())
2185 				printk(KERN_CRIT "Dead loop on virtual device "
2186 				       "%s, fix it urgently!\n", dev->name);
2187 		}
2188 	}
2189 
2190 	rc = -ENETDOWN;
2191 	rcu_read_unlock_bh();
2192 
2193 out_kfree_skb:
2194 	kfree_skb(skb);
2195 	return rc;
2196 out:
2197 	rcu_read_unlock_bh();
2198 	return rc;
2199 }
2200 EXPORT_SYMBOL(dev_queue_xmit);
2201 
2202 
2203 /*=======================================================================
2204 			Receiver routines
2205   =======================================================================*/
2206 
2207 int netdev_max_backlog __read_mostly = 1000;
2208 int netdev_tstamp_prequeue __read_mostly = 1;
2209 int netdev_budget __read_mostly = 300;
2210 int weight_p __read_mostly = 64;            /* old backlog weight */
2211 
2212 /* Called with irq disabled */
2213 static inline void ____napi_schedule(struct softnet_data *sd,
2214 				     struct napi_struct *napi)
2215 {
2216 	list_add_tail(&napi->poll_list, &sd->poll_list);
2217 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2218 }
2219 
2220 #ifdef CONFIG_RPS
2221 
2222 /* One global table that all flow-based protocols share. */
2223 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2224 EXPORT_SYMBOL(rps_sock_flow_table);
2225 
2226 /*
2227  * get_rps_cpu is called from netif_receive_skb and returns the target
2228  * CPU from the RPS map of the receiving queue for a given skb.
2229  * rcu_read_lock must be held on entry.
2230  */
2231 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2232 		       struct rps_dev_flow **rflowp)
2233 {
2234 	struct ipv6hdr *ip6;
2235 	struct iphdr *ip;
2236 	struct netdev_rx_queue *rxqueue;
2237 	struct rps_map *map;
2238 	struct rps_dev_flow_table *flow_table;
2239 	struct rps_sock_flow_table *sock_flow_table;
2240 	int cpu = -1;
2241 	u8 ip_proto;
2242 	u16 tcpu;
2243 	u32 addr1, addr2, ihl;
2244 	union {
2245 		u32 v32;
2246 		u16 v16[2];
2247 	} ports;
2248 
2249 	if (skb_rx_queue_recorded(skb)) {
2250 		u16 index = skb_get_rx_queue(skb);
2251 		if (unlikely(index >= dev->num_rx_queues)) {
2252 			if (net_ratelimit()) {
2253 				pr_warning("%s received packet on queue "
2254 					"%u, but number of RX queues is %u\n",
2255 					dev->name, index, dev->num_rx_queues);
2256 			}
2257 			goto done;
2258 		}
2259 		rxqueue = dev->_rx + index;
2260 	} else
2261 		rxqueue = dev->_rx;
2262 
2263 	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2264 		goto done;
2265 
2266 	if (skb->rxhash)
2267 		goto got_hash; /* Skip hash computation on packet header */
2268 
2269 	switch (skb->protocol) {
2270 	case __constant_htons(ETH_P_IP):
2271 		if (!pskb_may_pull(skb, sizeof(*ip)))
2272 			goto done;
2273 
2274 		ip = (struct iphdr *) skb->data;
2275 		ip_proto = ip->protocol;
2276 		addr1 = (__force u32) ip->saddr;
2277 		addr2 = (__force u32) ip->daddr;
2278 		ihl = ip->ihl;
2279 		break;
2280 	case __constant_htons(ETH_P_IPV6):
2281 		if (!pskb_may_pull(skb, sizeof(*ip6)))
2282 			goto done;
2283 
2284 		ip6 = (struct ipv6hdr *) skb->data;
2285 		ip_proto = ip6->nexthdr;
2286 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2287 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2288 		ihl = (40 >> 2);
2289 		break;
2290 	default:
2291 		goto done;
2292 	}
2293 	switch (ip_proto) {
2294 	case IPPROTO_TCP:
2295 	case IPPROTO_UDP:
2296 	case IPPROTO_DCCP:
2297 	case IPPROTO_ESP:
2298 	case IPPROTO_AH:
2299 	case IPPROTO_SCTP:
2300 	case IPPROTO_UDPLITE:
2301 		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2302 			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2303 			if (ports.v16[1] < ports.v16[0])
2304 				swap(ports.v16[0], ports.v16[1]);
2305 			break;
2306 		}
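		/* fall through: could not pull the ports, hash on addresses only */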
2307 	default:
2308 		ports.v32 = 0;
2309 		break;
2310 	}
2311 
2312 	/* get a consistent hash (same value on both flow directions) */
2313 	if (addr2 < addr1)
2314 		swap(addr1, addr2);
2315 	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2316 	if (!skb->rxhash)
2317 		skb->rxhash = 1;
2318 
2319 got_hash:
2320 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2321 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2322 	if (flow_table && sock_flow_table) {
2323 		u16 next_cpu;
2324 		struct rps_dev_flow *rflow;
2325 
2326 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2327 		tcpu = rflow->cpu;
2328 
2329 		next_cpu = sock_flow_table->ents[skb->rxhash &
2330 		    sock_flow_table->mask];
2331 
2332 		/*
2333 		 * If the desired CPU (where last recvmsg was done) is
2334 		 * different from current CPU (one in the rx-queue flow
2335 		 * table entry), switch if one of the following holds:
2336 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2337 		 *   - Current CPU is offline.
2338 		 *   - The current CPU's queue tail has advanced beyond the
2339 		 *     last packet that was enqueued using this table entry.
2340 		 *     This guarantees that all previous packets for the flow
2341 		 *     have been dequeued, thus preserving in order delivery.
2342 		 */
2343 		if (unlikely(tcpu != next_cpu) &&
2344 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2345 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2346 		      rflow->last_qtail)) >= 0)) {
2347 			tcpu = rflow->cpu = next_cpu;
2348 			if (tcpu != RPS_NO_CPU)
2349 				rflow->last_qtail = per_cpu(softnet_data,
2350 				    tcpu).input_queue_head;
2351 		}
2352 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2353 			*rflowp = rflow;
2354 			cpu = tcpu;
2355 			goto done;
2356 		}
2357 	}
2358 
2359 	map = rcu_dereference(rxqueue->rps_map);
2360 	if (map) {
2361 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2362 
2363 		if (cpu_online(tcpu)) {
2364 			cpu = tcpu;
2365 			goto done;
2366 		}
2367 	}
2368 
2369 done:
2370 	return cpu;
2371 }
2372 
2373 /* Called from hardirq (IPI) context */
2374 static void rps_trigger_softirq(void *data)
2375 {
2376 	struct softnet_data *sd = data;
2377 
2378 	____napi_schedule(sd, &sd->backlog);
2379 	sd->received_rps++;
2380 }
2381 
2382 #endif /* CONFIG_RPS */
2383 
2384 /*
2385  * Check if this softnet_data structure belongs to another cpu.
2386  * If yes, queue it to our IPI list and return 1.
2387  * If no, return 0.
2388  */
2389 static int rps_ipi_queued(struct softnet_data *sd)
2390 {
2391 #ifdef CONFIG_RPS
2392 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2393 
2394 	if (sd != mysd) {
2395 		sd->rps_ipi_next = mysd->rps_ipi_list;
2396 		mysd->rps_ipi_list = sd;
2397 
2398 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2399 		return 1;
2400 	}
2401 #endif /* CONFIG_RPS */
2402 	return 0;
2403 }
2404 
2405 /*
2406  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2407  * queue (may be a remote CPU queue).
2408  */
2409 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2410 			      unsigned int *qtail)
2411 {
2412 	struct softnet_data *sd;
2413 	unsigned long flags;
2414 
2415 	sd = &per_cpu(softnet_data, cpu);
2416 
2417 	local_irq_save(flags);
2418 
2419 	rps_lock(sd);
2420 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2421 		if (skb_queue_len(&sd->input_pkt_queue)) {
2422 enqueue:
2423 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2424 #ifdef CONFIG_RPS
2425 			*qtail = sd->input_queue_head +
2426 					skb_queue_len(&sd->input_pkt_queue);
2427 #endif
2428 			rps_unlock(sd);
2429 			local_irq_restore(flags);
2430 			return NET_RX_SUCCESS;
2431 		}
2432 
2433 		/* Schedule NAPI for the backlog device.
2434 		 * We can use a non-atomic operation since we own the queue lock.
2435 		 */
2436 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2437 			if (!rps_ipi_queued(sd))
2438 				____napi_schedule(sd, &sd->backlog);
2439 		}
2440 		goto enqueue;
2441 	}
2442 
2443 	sd->dropped++;
2444 	rps_unlock(sd);
2445 
2446 	local_irq_restore(flags);
2447 
2448 	kfree_skb(skb);
2449 	return NET_RX_DROP;
2450 }
2451 
2452 /**
2453  *	netif_rx	-	post buffer to the network code
2454  *	@skb: buffer to post
2455  *
2456  *	This function receives a packet from a device driver and queues it for
2457  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2458  *	may be dropped during processing for congestion control or by the
2459  *	protocol layers.
2460  *
2461  *	return values:
2462  *	NET_RX_SUCCESS	(no congestion)
2463  *	NET_RX_DROP     (packet was dropped)
2464  *
2465  */
2466 
2467 int netif_rx(struct sk_buff *skb)
2468 {
2469 	int ret;
2470 
2471 	/* if netpoll wants it, pretend we never saw it */
2472 	if (netpoll_rx(skb))
2473 		return NET_RX_DROP;
2474 
2475 	if (netdev_tstamp_prequeue)
2476 		net_timestamp_check(skb);
2477 
2478 #ifdef CONFIG_RPS
2479 	{
2480 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2481 		int cpu;
2482 
2483 		rcu_read_lock();
2484 
2485 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2486 		if (cpu < 0)
2487 			cpu = smp_processor_id();
2488 
2489 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2490 
2491 		rcu_read_unlock();
2492 	}
2493 #else
2494 	{
2495 		unsigned int qtail;
2496 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2497 		put_cpu();
2498 	}
2499 #endif
2500 	return ret;
2501 }
2502 EXPORT_SYMBOL(netif_rx);
2503 
2504 int netif_rx_ni(struct sk_buff *skb)
2505 {
2506 	int err;
2507 
2508 	preempt_disable();
2509 	err = netif_rx(skb);
2510 	if (local_softirq_pending())
2511 		do_softirq();
2512 	preempt_enable();
2513 
2514 	return err;
2515 }
2516 EXPORT_SYMBOL(netif_rx_ni);
2517 
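/*
 * net_tx_action - NET_TX_SOFTIRQ handler: free skbs queued for destruction
 * on the per-cpu completion queue, and run qdiscs that were scheduled for
 * transmission on the per-cpu output queue.
 */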
2518 static void net_tx_action(struct softirq_action *h)
2519 {
2520 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2521 
2522 	if (sd->completion_queue) {
2523 		struct sk_buff *clist;
2524 
2525 		local_irq_disable();
2526 		clist = sd->completion_queue;
2527 		sd->completion_queue = NULL;
2528 		local_irq_enable();
2529 
2530 		while (clist) {
2531 			struct sk_buff *skb = clist;
2532 			clist = clist->next;
2533 
2534 			WARN_ON(atomic_read(&skb->users));
2535 			__kfree_skb(skb);
2536 		}
2537 	}
2538 
2539 	if (sd->output_queue) {
2540 		struct Qdisc *head;
2541 
2542 		local_irq_disable();
2543 		head = sd->output_queue;
2544 		sd->output_queue = NULL;
2545 		sd->output_queue_tailp = &sd->output_queue;
2546 		local_irq_enable();
2547 
2548 		while (head) {
2549 			struct Qdisc *q = head;
2550 			spinlock_t *root_lock;
2551 
2552 			head = head->next_sched;
2553 
2554 			root_lock = qdisc_lock(q);
2555 			if (spin_trylock(root_lock)) {
2556 				smp_mb__before_clear_bit();
2557 				clear_bit(__QDISC_STATE_SCHED,
2558 					  &q->state);
2559 				qdisc_run(q);
2560 				spin_unlock(root_lock);
2561 			} else {
2562 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2563 					      &q->state)) {
2564 					__netif_reschedule(q);
2565 				} else {
2566 					smp_mb__before_clear_bit();
2567 					clear_bit(__QDISC_STATE_SCHED,
2568 						  &q->state);
2569 				}
2570 			}
2571 		}
2572 	}
2573 }
2574 
2575 static inline int deliver_skb(struct sk_buff *skb,
2576 			      struct packet_type *pt_prev,
2577 			      struct net_device *orig_dev)
2578 {
2579 	atomic_inc(&skb->users);
2580 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2581 }
2582 
2583 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2584 
2585 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2586 /* This hook is defined here for ATM LANE */
2587 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2588 			     unsigned char *addr) __read_mostly;
2589 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2590 #endif
2591 
2592 /*
2593  * If the bridge module is loaded, call the bridging hook.
2594  * Returns NULL if the packet was consumed.
2595  */
2596 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2597 					struct sk_buff *skb) __read_mostly;
2598 EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2599 
2600 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2601 					    struct packet_type **pt_prev, int *ret,
2602 					    struct net_device *orig_dev)
2603 {
2604 	struct net_bridge_port *port;
2605 
2606 	if (skb->pkt_type == PACKET_LOOPBACK ||
2607 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2608 		return skb;
2609 
2610 	if (*pt_prev) {
2611 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2612 		*pt_prev = NULL;
2613 	}
2614 
2615 	return br_handle_frame_hook(port, skb);
2616 }
2617 #else
2618 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2619 #endif
2620 
2621 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2622 struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
2623 					     struct sk_buff *skb) __read_mostly;
2624 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2625 
2626 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2627 					     struct packet_type **pt_prev,
2628 					     int *ret,
2629 					     struct net_device *orig_dev)
2630 {
2631 	struct macvlan_port *port;
2632 
2633 	port = rcu_dereference(skb->dev->macvlan_port);
2634 	if (!port)
2635 		return skb;
2636 
2637 	if (*pt_prev) {
2638 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2639 		*pt_prev = NULL;
2640 	}
2641 	return macvlan_handle_frame_hook(port, skb);
2642 }
2643 #else
2644 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2645 #endif
2646 
2647 #ifdef CONFIG_NET_CLS_ACT
2648 /* TODO: Maybe we should just force sch_ingress to be compiled in
2649  * when CONFIG_NET_CLS_ACT is? Otherwise we pay a few useless
2650  * instructions (a compare and 2 extra stores) right now if we don't
2651  * have it on but do have CONFIG_NET_CLS_ACT.
2652  * NOTE: This doesn't remove any functionality; if you don't have
2653  * the ingress scheduler, you just can't add policies on ingress.
2654  *
2655  */
2656 static int ing_filter(struct sk_buff *skb)
2657 {
2658 	struct net_device *dev = skb->dev;
2659 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2660 	struct netdev_queue *rxq;
2661 	int result = TC_ACT_OK;
2662 	struct Qdisc *q;
2663 
2664 	if (MAX_RED_LOOP < ttl++) {
2665 		printk(KERN_WARNING
2666 		       "Redir loop detected, dropping packet (%d->%d)\n",
2667 		       skb->skb_iif, dev->ifindex);
2668 		return TC_ACT_SHOT;
2669 	}
2670 
2671 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2672 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2673 
2674 	rxq = &dev->rx_queue;
2675 
2676 	q = rxq->qdisc;
2677 	if (q != &noop_qdisc) {
2678 		spin_lock(qdisc_lock(q));
2679 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2680 			result = qdisc_enqueue_root(skb, q);
2681 		spin_unlock(qdisc_lock(q));
2682 	}
2683 
2684 	return result;
2685 }
2686 
2687 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2688 					 struct packet_type **pt_prev,
2689 					 int *ret, struct net_device *orig_dev)
2690 {
2691 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2692 		goto out;
2693 
2694 	if (*pt_prev) {
2695 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2696 		*pt_prev = NULL;
2697 	} else {
2698 		/* Huh? Why does turning on AF_PACKET affect this? */
2699 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2700 	}
2701 
2702 	switch (ing_filter(skb)) {
2703 	case TC_ACT_SHOT:
2704 	case TC_ACT_STOLEN:
2705 		kfree_skb(skb);
2706 		return NULL;
2707 	}
2708 
2709 out:
2710 	skb->tc_verd = 0;
2711 	return skb;
2712 }
2713 #endif
2714 
2715 /*
2716  * 	netif_nit_deliver - deliver received packets to network taps
2717  * 	@skb: buffer
2718  *
2719  * 	This function is used to deliver incoming packets to network
2720  * 	taps. It should be used when the normal netif_receive_skb path
2721  * 	is bypassed, for example because of VLAN acceleration.
2722  */
2723 void netif_nit_deliver(struct sk_buff *skb)
2724 {
2725 	struct packet_type *ptype;
2726 
2727 	if (list_empty(&ptype_all))
2728 		return;
2729 
2730 	skb_reset_network_header(skb);
2731 	skb_reset_transport_header(skb);
2732 	skb->mac_len = skb->network_header - skb->mac_header;
2733 
2734 	rcu_read_lock();
2735 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2736 		if (!ptype->dev || ptype->dev == skb->dev)
2737 			deliver_skb(skb, ptype, skb->dev);
2738 	}
2739 	rcu_read_unlock();
2740 }
2741 
2742 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2743 					      struct net_device *master)
2744 {
2745 	if (skb->pkt_type == PACKET_HOST) {
2746 		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2747 
2748 		memcpy(dest, master->dev_addr, ETH_ALEN);
2749 	}
2750 }
2751 
2752 /* On bonding slaves other than the currently active slave, suppress
2753  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2754  * ARP on active-backup slaves with arp_validate enabled.
2755  */
2756 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2757 {
2758 	struct net_device *dev = skb->dev;
2759 
2760 	if (master->priv_flags & IFF_MASTER_ARPMON)
2761 		dev->last_rx = jiffies;
2762 
2763 	if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2764 		/* Do address unmangling. The local destination address
2765 		 * will always be the one the master has. This provides the
2766 		 * right functionality in a bridge.
2767 		 */
2768 		skb_bond_set_mac_by_master(skb, master);
2769 	}
2770 
2771 	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2772 		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2773 		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2774 			return 0;
2775 
2776 		if (master->priv_flags & IFF_MASTER_ALB) {
2777 			if (skb->pkt_type != PACKET_BROADCAST &&
2778 			    skb->pkt_type != PACKET_MULTICAST)
2779 				return 0;
2780 		}
2781 		if (master->priv_flags & IFF_MASTER_8023AD &&
2782 		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2783 			return 0;
2784 
2785 		return 1;
2786 	}
2787 	return 0;
2788 }
2789 EXPORT_SYMBOL(__skb_bond_should_drop);
2790 
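/*
 * __netif_receive_skb - core receive path: deliver the skb to taps,
 * ingress classification, bridge/macvlan hooks, and finally to the
 * matching protocol handlers in ptype_base.
 */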
2791 static int __netif_receive_skb(struct sk_buff *skb)
2792 {
2793 	struct packet_type *ptype, *pt_prev;
2794 	struct net_device *orig_dev;
2795 	struct net_device *master;
2796 	struct net_device *null_or_orig;
2797 	struct net_device *null_or_bond;
2798 	int ret = NET_RX_DROP;
2799 	__be16 type;
2800 
2801 	if (!netdev_tstamp_prequeue)
2802 		net_timestamp_check(skb);
2803 
2804 	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2805 		return NET_RX_SUCCESS;
2806 
2807 	/* if we've gotten here through NAPI, check netpoll */
2808 	if (netpoll_receive_skb(skb))
2809 		return NET_RX_DROP;
2810 
2811 	if (!skb->skb_iif)
2812 		skb->skb_iif = skb->dev->ifindex;
2813 
2814 	null_or_orig = NULL;
2815 	orig_dev = skb->dev;
2816 	master = ACCESS_ONCE(orig_dev->master);
2817 	if (master) {
2818 		if (skb_bond_should_drop(skb, master))
2819 			null_or_orig = orig_dev; /* deliver only exact match */
2820 		else
2821 			skb->dev = master;
2822 	}
2823 
2824 	__get_cpu_var(softnet_data).processed++;
2825 
2826 	skb_reset_network_header(skb);
2827 	skb_reset_transport_header(skb);
2828 	skb->mac_len = skb->network_header - skb->mac_header;
2829 
2830 	pt_prev = NULL;
2831 
2832 	rcu_read_lock();
2833 
2834 #ifdef CONFIG_NET_CLS_ACT
2835 	if (skb->tc_verd & TC_NCLS) {
2836 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2837 		goto ncls;
2838 	}
2839 #endif
2840 
2841 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2842 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2843 		    ptype->dev == orig_dev) {
2844 			if (pt_prev)
2845 				ret = deliver_skb(skb, pt_prev, orig_dev);
2846 			pt_prev = ptype;
2847 		}
2848 	}
2849 
2850 #ifdef CONFIG_NET_CLS_ACT
2851 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2852 	if (!skb)
2853 		goto out;
2854 ncls:
2855 #endif
2856 
2857 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2858 	if (!skb)
2859 		goto out;
2860 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2861 	if (!skb)
2862 		goto out;
2863 
2864 	/*
2865 	 * Make sure frames received on VLAN interfaces stacked on
2866 	 * bonding interfaces still make their way to any base bonding
2867 	 * device that may have registered for a specific ptype.  The
2868 	 * handler may have to adjust skb->dev and orig_dev.
2869 	 */
2870 	null_or_bond = NULL;
2871 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2872 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2873 		null_or_bond = vlan_dev_real_dev(skb->dev);
2874 	}
2875 
2876 	type = skb->protocol;
2877 	list_for_each_entry_rcu(ptype,
2878 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2879 		if (ptype->type == type && (ptype->dev == null_or_orig ||
2880 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
2881 		     ptype->dev == null_or_bond)) {
2882 			if (pt_prev)
2883 				ret = deliver_skb(skb, pt_prev, orig_dev);
2884 			pt_prev = ptype;
2885 		}
2886 	}
2887 
2888 	if (pt_prev) {
2889 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2890 	} else {
2891 		kfree_skb(skb);
2892 		/* Jamal, now you will not be able to escape explaining
2893 		 * to me how you were going to use this. :-)
2894 		 */
2895 		ret = NET_RX_DROP;
2896 	}
2897 
2898 out:
2899 	rcu_read_unlock();
2900 	return ret;
2901 }
2902 
2903 /**
2904  *	netif_receive_skb - process receive buffer from network
2905  *	@skb: buffer to process
2906  *
2907  *	netif_receive_skb() is the main receive data processing function.
2908  *	It always succeeds. The buffer may be dropped during processing
2909  *	for congestion control or by the protocol layers.
2910  *
2911  *	This function may only be called from softirq context and interrupts
2912  *	should be enabled.
2913  *
2914  *	Return values (usually ignored):
2915  *	NET_RX_SUCCESS: no congestion
2916  *	NET_RX_DROP: packet was dropped
2917  */
2918 int netif_receive_skb(struct sk_buff *skb)
2919 {
2920 	if (netdev_tstamp_prequeue)
2921 		net_timestamp_check(skb);
2922 
2923 #ifdef CONFIG_RPS
2924 	{
2925 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2926 		int cpu, ret;
2927 
2928 		rcu_read_lock();
2929 
2930 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2931 
2932 		if (cpu >= 0) {
2933 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2934 			rcu_read_unlock();
2935 		} else {
2936 			rcu_read_unlock();
2937 			ret = __netif_receive_skb(skb);
2938 		}
2939 
2940 		return ret;
2941 	}
2942 #else
2943 	return __netif_receive_skb(skb);
2944 #endif
2945 }
2946 EXPORT_SYMBOL(netif_receive_skb);
2947 
2948 /* Network device is going away, flush any packets still pending
2949  * Called with irqs disabled.
2950  */
2951 static void flush_backlog(void *arg)
2952 {
2953 	struct net_device *dev = arg;
2954 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2955 	struct sk_buff *skb, *tmp;
2956 
2957 	rps_lock(sd);
2958 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
2959 		if (skb->dev == dev) {
2960 			__skb_unlink(skb, &sd->input_pkt_queue);
2961 			kfree_skb(skb);
2962 			input_queue_head_add(sd, 1);
2963 		}
2964 	}
2965 	rps_unlock(sd);
2966 
2967 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2968 		if (skb->dev == dev) {
2969 			__skb_unlink(skb, &sd->process_queue);
2970 			kfree_skb(skb);
2971 		}
2972 	}
2973 }
2974 
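/*
 * napi_gro_complete - hand a held GRO skb to the stack, first letting the
 * protocol's gro_complete() callback finalize the merged headers when more
 * than one segment was coalesced.
 */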
2975 static int napi_gro_complete(struct sk_buff *skb)
2976 {
2977 	struct packet_type *ptype;
2978 	__be16 type = skb->protocol;
2979 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2980 	int err = -ENOENT;
2981 
2982 	if (NAPI_GRO_CB(skb)->count == 1) {
2983 		skb_shinfo(skb)->gso_size = 0;
2984 		goto out;
2985 	}
2986 
2987 	rcu_read_lock();
2988 	list_for_each_entry_rcu(ptype, head, list) {
2989 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2990 			continue;
2991 
2992 		err = ptype->gro_complete(skb);
2993 		break;
2994 	}
2995 	rcu_read_unlock();
2996 
2997 	if (err) {
2998 		WARN_ON(&ptype->list == head);
2999 		kfree_skb(skb);
3000 		return NET_RX_SUCCESS;
3001 	}
3002 
3003 out:
3004 	return netif_receive_skb(skb);
3005 }
3006 
3007 static void napi_gro_flush(struct napi_struct *napi)
3008 {
3009 	struct sk_buff *skb, *next;
3010 
3011 	for (skb = napi->gro_list; skb; skb = next) {
3012 		next = skb->next;
3013 		skb->next = NULL;
3014 		napi_gro_complete(skb);
3015 	}
3016 
3017 	napi->gro_count = 0;
3018 	napi->gro_list = NULL;
3019 }
3020 
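
/*
 * dev_gro_receive - try to coalesce the skb with a same-flow packet held on
 * napi->gro_list via the protocol's gro_receive() callback.  Returns
 * GRO_MERGED/GRO_MERGED_FREE on a merge, GRO_HELD when the skb is kept on
 * the list, or GRO_NORMAL when it should go up the stack immediately.
 */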
3021 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3022 {
3023 	struct sk_buff **pp = NULL;
3024 	struct packet_type *ptype;
3025 	__be16 type = skb->protocol;
3026 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3027 	int same_flow;
3028 	int mac_len;
3029 	enum gro_result ret;
3030 
3031 	if (!(skb->dev->features & NETIF_F_GRO))
3032 		goto normal;
3033 
3034 	if (skb_is_gso(skb) || skb_has_frags(skb))
3035 		goto normal;
3036 
3037 	rcu_read_lock();
3038 	list_for_each_entry_rcu(ptype, head, list) {
3039 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3040 			continue;
3041 
3042 		skb_set_network_header(skb, skb_gro_offset(skb));
3043 		mac_len = skb->network_header - skb->mac_header;
3044 		skb->mac_len = mac_len;
3045 		NAPI_GRO_CB(skb)->same_flow = 0;
3046 		NAPI_GRO_CB(skb)->flush = 0;
3047 		NAPI_GRO_CB(skb)->free = 0;
3048 
3049 		pp = ptype->gro_receive(&napi->gro_list, skb);
3050 		break;
3051 	}
3052 	rcu_read_unlock();
3053 
3054 	if (&ptype->list == head)
3055 		goto normal;
3056 
3057 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3058 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3059 
3060 	if (pp) {
3061 		struct sk_buff *nskb = *pp;
3062 
3063 		*pp = nskb->next;
3064 		nskb->next = NULL;
3065 		napi_gro_complete(nskb);
3066 		napi->gro_count--;
3067 	}
3068 
3069 	if (same_flow)
3070 		goto ok;
3071 
3072 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3073 		goto normal;
3074 
3075 	napi->gro_count++;
3076 	NAPI_GRO_CB(skb)->count = 1;
3077 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3078 	skb->next = napi->gro_list;
3079 	napi->gro_list = skb;
3080 	ret = GRO_HELD;
3081 
3082 pull:
3083 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3084 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3085 
3086 		BUG_ON(skb->end - skb->tail < grow);
3087 
3088 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3089 
3090 		skb->tail += grow;
3091 		skb->data_len -= grow;
3092 
3093 		skb_shinfo(skb)->frags[0].page_offset += grow;
3094 		skb_shinfo(skb)->frags[0].size -= grow;
3095 
3096 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3097 			put_page(skb_shinfo(skb)->frags[0].page);
3098 			memmove(skb_shinfo(skb)->frags,
3099 				skb_shinfo(skb)->frags + 1,
3100 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3101 		}
3102 	}
3103 
3104 ok:
3105 	return ret;
3106 
3107 normal:
3108 	ret = GRO_NORMAL;
3109 	goto pull;
3110 }
3111 EXPORT_SYMBOL(dev_gro_receive);
3112 
3113 static gro_result_t
3114 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3115 {
3116 	struct sk_buff *p;
3117 
3118 	if (netpoll_rx_on(skb))
3119 		return GRO_NORMAL;
3120 
3121 	for (p = napi->gro_list; p; p = p->next) {
3122 		NAPI_GRO_CB(p)->same_flow =
3123 			(p->dev == skb->dev) &&
3124 			!compare_ether_header(skb_mac_header(p),
3125 					      skb_gro_mac_header(skb));
3126 		NAPI_GRO_CB(p)->flush = 0;
3127 	}
3128 
3129 	return dev_gro_receive(napi, skb);
3130 }
3131 
3132 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3133 {
3134 	switch (ret) {
3135 	case GRO_NORMAL:
3136 		if (netif_receive_skb(skb))
3137 			ret = GRO_DROP;
3138 		break;
3139 
3140 	case GRO_DROP:
3141 	case GRO_MERGED_FREE:
3142 		kfree_skb(skb);
3143 		break;
3144 
3145 	case GRO_HELD:
3146 	case GRO_MERGED:
3147 		break;
3148 	}
3149 
3150 	return ret;
3151 }
3152 EXPORT_SYMBOL(napi_skb_finish);
3153 
3154 void skb_gro_reset_offset(struct sk_buff *skb)
3155 {
3156 	NAPI_GRO_CB(skb)->data_offset = 0;
3157 	NAPI_GRO_CB(skb)->frag0 = NULL;
3158 	NAPI_GRO_CB(skb)->frag0_len = 0;
3159 
3160 	if (skb->mac_header == skb->tail &&
3161 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3162 		NAPI_GRO_CB(skb)->frag0 =
3163 			page_address(skb_shinfo(skb)->frags[0].page) +
3164 			skb_shinfo(skb)->frags[0].page_offset;
3165 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3166 	}
3167 }
3168 EXPORT_SYMBOL(skb_gro_reset_offset);
3169 
3170 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3171 {
3172 	skb_gro_reset_offset(skb);
3173 
3174 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3175 }
3176 EXPORT_SYMBOL(napi_gro_receive);
3177 
3178 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3179 {
3180 	__skb_pull(skb, skb_headlen(skb));
3181 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3182 
3183 	napi->skb = skb;
3184 }
3185 EXPORT_SYMBOL(napi_reuse_skb);
3186 
3187 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3188 {
3189 	struct sk_buff *skb = napi->skb;
3190 
3191 	if (!skb) {
3192 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3193 		if (skb)
3194 			napi->skb = skb;
3195 	}
3196 	return skb;
3197 }
3198 EXPORT_SYMBOL(napi_get_frags);
3199 
3200 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3201 			       gro_result_t ret)
3202 {
3203 	switch (ret) {
3204 	case GRO_NORMAL:
3205 	case GRO_HELD:
3206 		skb->protocol = eth_type_trans(skb, skb->dev);
3207 
3208 		if (ret == GRO_HELD)
3209 			skb_gro_pull(skb, -ETH_HLEN);
3210 		else if (netif_receive_skb(skb))
3211 			ret = GRO_DROP;
3212 		break;
3213 
3214 	case GRO_DROP:
3215 	case GRO_MERGED_FREE:
3216 		napi_reuse_skb(napi, skb);
3217 		break;
3218 
3219 	case GRO_MERGED:
3220 		break;
3221 	}
3222 
3223 	return ret;
3224 }
3225 EXPORT_SYMBOL(napi_frags_finish);
3226 
3227 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3228 {
3229 	struct sk_buff *skb = napi->skb;
3230 	struct ethhdr *eth;
3231 	unsigned int hlen;
3232 	unsigned int off;
3233 
3234 	napi->skb = NULL;
3235 
3236 	skb_reset_mac_header(skb);
3237 	skb_gro_reset_offset(skb);
3238 
3239 	off = skb_gro_offset(skb);
3240 	hlen = off + sizeof(*eth);
3241 	eth = skb_gro_header_fast(skb, off);
3242 	if (skb_gro_header_hard(skb, hlen)) {
3243 		eth = skb_gro_header_slow(skb, hlen, off);
3244 		if (unlikely(!eth)) {
3245 			napi_reuse_skb(napi, skb);
3246 			skb = NULL;
3247 			goto out;
3248 		}
3249 	}
3250 
3251 	skb_gro_pull(skb, sizeof(*eth));
3252 
3253 	/*
3254 	 * This works because the only protocols we care about don't require
3255 	 * special handling.  We'll fix it up properly at the end.
3256 	 */
3257 	skb->protocol = eth->h_proto;
3258 
3259 out:
3260 	return skb;
3261 }
3262 EXPORT_SYMBOL(napi_frags_skb);
3263 
3264 gro_result_t napi_gro_frags(struct napi_struct *napi)
3265 {
3266 	struct sk_buff *skb = napi_frags_skb(napi);
3267 
3268 	if (!skb)
3269 		return GRO_DROP;
3270 
3271 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3272 }
3273 EXPORT_SYMBOL(napi_gro_frags);
3274 
3275 /*
3276  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3277  * Note: called with local irq disabled, but exits with local irq enabled.
3278  */
3279 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3280 {
3281 #ifdef CONFIG_RPS
3282 	struct softnet_data *remsd = sd->rps_ipi_list;
3283 
3284 	if (remsd) {
3285 		sd->rps_ipi_list = NULL;
3286 
3287 		local_irq_enable();
3288 
3289 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3290 		while (remsd) {
3291 			struct softnet_data *next = remsd->rps_ipi_next;
3292 
3293 			if (cpu_online(remsd->cpu))
3294 				__smp_call_function_single(remsd->cpu,
3295 							   &remsd->csd, 0);
3296 			remsd = next;
3297 		}
3298 	} else
3299 #endif
3300 		local_irq_enable();
3301 }
3302 
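/*
 * process_backlog - NAPI poll handler for the per-cpu backlog device: feed
 * packets queued by netif_rx()/RPS from input_pkt_queue through
 * __netif_receive_skb(), up to the given quota.
 */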
3303 static int process_backlog(struct napi_struct *napi, int quota)
3304 {
3305 	int work = 0;
3306 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3307 
3308 #ifdef CONFIG_RPS
3309 	/* Check if we have pending IPIs; it's better to send them now
3310 	 * than to wait for net_rx_action() to end.
3311 	 */
3312 	if (sd->rps_ipi_list) {
3313 		local_irq_disable();
3314 		net_rps_action_and_irq_enable(sd);
3315 	}
3316 #endif
3317 	napi->weight = weight_p;
3318 	local_irq_disable();
3319 	while (work < quota) {
3320 		struct sk_buff *skb;
3321 		unsigned int qlen;
3322 
3323 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3324 			local_irq_enable();
3325 			__netif_receive_skb(skb);
3326 			if (++work >= quota)
3327 				return work;
3328 			local_irq_disable();
3329 		}
3330 
3331 		rps_lock(sd);
3332 		qlen = skb_queue_len(&sd->input_pkt_queue);
3333 		if (qlen) {
3334 			input_queue_head_add(sd, qlen);
3335 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3336 						   &sd->process_queue);
3337 		}
3338 		if (qlen < quota - work) {
3339 			/*
3340 			 * Inline a custom version of __napi_complete().
3341 			 * Only the current cpu owns and manipulates this napi,
3342 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3343 			 * so we can use a plain write instead of clear_bit(),
3344 			 * and we don't need an smp_mb() memory barrier.
3345 			 */
3346 			list_del(&napi->poll_list);
3347 			napi->state = 0;
3348 
3349 			quota = work + qlen;
3350 		}
3351 		rps_unlock(sd);
3352 	}
3353 	local_irq_enable();
3354 
3355 	return work;
3356 }
3357 
3358 /**
3359  * __napi_schedule - schedule for receive
3360  * @n: entry to schedule
3361  *
3362  * The entry's receive function will be scheduled to run
3363  */
3364 void __napi_schedule(struct napi_struct *n)
3365 {
3366 	unsigned long flags;
3367 
3368 	local_irq_save(flags);
3369 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3370 	local_irq_restore(flags);
3371 }
3372 EXPORT_SYMBOL(__napi_schedule);
3373 
3374 void __napi_complete(struct napi_struct *n)
3375 {
3376 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3377 	BUG_ON(n->gro_list);
3378 
3379 	list_del(&n->poll_list);
3380 	smp_mb__before_clear_bit();
3381 	clear_bit(NAPI_STATE_SCHED, &n->state);
3382 }
3383 EXPORT_SYMBOL(__napi_complete);
3384 
3385 void napi_complete(struct napi_struct *n)
3386 {
3387 	unsigned long flags;
3388 
3389 	/*
3390 	 * Don't let napi dequeue from the cpu poll list,
3391 	 * just in case it's running on a different cpu.
3392 	 */
3393 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3394 		return;
3395 
3396 	napi_gro_flush(n);
3397 	local_irq_save(flags);
3398 	__napi_complete(n);
3399 	local_irq_restore(flags);
3400 }
3401 EXPORT_SYMBOL(napi_complete);
3402 
3403 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3404 		    int (*poll)(struct napi_struct *, int), int weight)
3405 {
3406 	INIT_LIST_HEAD(&napi->poll_list);
3407 	napi->gro_count = 0;
3408 	napi->gro_list = NULL;
3409 	napi->skb = NULL;
3410 	napi->poll = poll;
3411 	napi->weight = weight;
3412 	list_add(&napi->dev_list, &dev->napi_list);
3413 	napi->dev = dev;
3414 #ifdef CONFIG_NETPOLL
3415 	spin_lock_init(&napi->poll_lock);
3416 	napi->poll_owner = -1;
3417 #endif
3418 	set_bit(NAPI_STATE_SCHED, &napi->state);
3419 }
3420 EXPORT_SYMBOL(netif_napi_add);
3421 
3422 void netif_napi_del(struct napi_struct *napi)
3423 {
3424 	struct sk_buff *skb, *next;
3425 
3426 	list_del_init(&napi->dev_list);
3427 	napi_free_frags(napi);
3428 
3429 	for (skb = napi->gro_list; skb; skb = next) {
3430 		next = skb->next;
3431 		skb->next = NULL;
3432 		kfree_skb(skb);
3433 	}
3434 
3435 	napi->gro_list = NULL;
3436 	napi->gro_count = 0;
3437 }
3438 EXPORT_SYMBOL(netif_napi_del);
3439 
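/*
 * net_rx_action - NET_RX_SOFTIRQ handler: poll each scheduled NAPI instance
 * for up to its weight, within an overall budget of netdev_budget packets
 * and roughly two jiffies of wall time.
 */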
3440 static void net_rx_action(struct softirq_action *h)
3441 {
3442 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3443 	unsigned long time_limit = jiffies + 2;
3444 	int budget = netdev_budget;
3445 	void *have;
3446 
3447 	local_irq_disable();
3448 
3449 	while (!list_empty(&sd->poll_list)) {
3450 		struct napi_struct *n;
3451 		int work, weight;
3452 
3453 		/* If the softirq window is exhausted then punt.
3454 		 * Allow this to run for 2 jiffies, which allows
3455 		 * an average latency of 1.5/HZ.
3456 		 */
3457 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3458 			goto softnet_break;
3459 
3460 		local_irq_enable();
3461 
3462 		/* Even though interrupts have been re-enabled, this
3463 		 * access is safe because interrupts can only add new
3464 		 * entries to the tail of this list, and only ->poll()
3465 		 * calls can remove this head entry from the list.
3466 		 */
3467 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3468 
3469 		have = netpoll_poll_lock(n);
3470 
3471 		weight = n->weight;
3472 
3473 		/* This NAPI_STATE_SCHED test is for avoiding a race
3474 		 * with netpoll's poll_napi().  Only the entity which
3475 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3476 		 * actually make the ->poll() call.  Therefore we avoid
3477 		 * accidentally calling ->poll() when NAPI is not scheduled.
3478 		 */
3479 		work = 0;
3480 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3481 			work = n->poll(n, weight);
3482 			trace_napi_poll(n);
3483 		}
3484 
3485 		WARN_ON_ONCE(work > weight);
3486 
3487 		budget -= work;
3488 
3489 		local_irq_disable();
3490 
3491 		/* Drivers must not modify the NAPI state if they
3492 		 * consume the entire weight.  In such cases this code
3493 		 * still "owns" the NAPI instance and therefore can
3494 		 * move the instance around on the list at-will.
3495 		 */
3496 		if (unlikely(work == weight)) {
3497 			if (unlikely(napi_disable_pending(n))) {
3498 				local_irq_enable();
3499 				napi_complete(n);
3500 				local_irq_disable();
3501 			} else
3502 				list_move_tail(&n->poll_list, &sd->poll_list);
3503 		}
3504 
3505 		netpoll_poll_unlock(have);
3506 	}
3507 out:
3508 	net_rps_action_and_irq_enable(sd);
3509 
3510 #ifdef CONFIG_NET_DMA
3511 	/*
3512 	 * There may not be any more sk_buffs coming right now, so push
3513 	 * any pending DMA copies to hardware
3514 	 */
3515 	dma_issue_pending_all();
3516 #endif
3517 
3518 	return;
3519 
3520 softnet_break:
3521 	sd->time_squeeze++;
3522 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3523 	goto out;
3524 }
3525 
3526 static gifconf_func_t *gifconf_list[NPROTO];
3527 
3528 /**
3529  *	register_gifconf	-	register a SIOCGIF handler
3530  *	@family: Address family
3531  *	@gifconf: Function handler
3532  *
3533  *	Register protocol dependent address dumping routines. The handler
3534  *	that is passed must not be freed or reused until it has been replaced
3535  *	by another handler.
3536  */
3537 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3538 {
3539 	if (family >= NPROTO)
3540 		return -EINVAL;
3541 	gifconf_list[family] = gifconf;
3542 	return 0;
3543 }
3544 EXPORT_SYMBOL(register_gifconf);
3545 
3546 
3547 /*
3548  *	Map an interface index to its name (SIOCGIFNAME)
3549  */
3550 
3551 /*
3552  *	We need this ioctl for efficient implementation of the
3553  *	if_indextoname() function required by the IPv6 API.  Without
3554  *	it, we would have to search all the interfaces to find a
3555  *	match.  --pb
3556  */
3557 
3558 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3559 {
3560 	struct net_device *dev;
3561 	struct ifreq ifr;
3562 
3563 	/*
3564 	 *	Fetch the caller's info block.
3565 	 */
3566 
3567 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3568 		return -EFAULT;
3569 
3570 	rcu_read_lock();
3571 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3572 	if (!dev) {
3573 		rcu_read_unlock();
3574 		return -ENODEV;
3575 	}
3576 
3577 	strcpy(ifr.ifr_name, dev->name);
3578 	rcu_read_unlock();
3579 
3580 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3581 		return -EFAULT;
3582 	return 0;
3583 }
3584 
3585 /*
3586  *	Perform a SIOCGIFCONF call. This structure will change
3587  *	size eventually, and there is nothing I can do about it.
3588  *	Thus we will need a 'compatibility mode'.
3589  */
3590 
3591 static int dev_ifconf(struct net *net, char __user *arg)
3592 {
3593 	struct ifconf ifc;
3594 	struct net_device *dev;
3595 	char __user *pos;
3596 	int len;
3597 	int total;
3598 	int i;
3599 
3600 	/*
3601 	 *	Fetch the caller's info block.
3602 	 */
3603 
3604 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3605 		return -EFAULT;
3606 
3607 	pos = ifc.ifc_buf;
3608 	len = ifc.ifc_len;
3609 
3610 	/*
3611 	 *	Loop over the interfaces, and write an info block for each.
3612 	 */
3613 
3614 	total = 0;
3615 	for_each_netdev(net, dev) {
3616 		for (i = 0; i < NPROTO; i++) {
3617 			if (gifconf_list[i]) {
3618 				int done;
3619 				if (!pos)
3620 					done = gifconf_list[i](dev, NULL, 0);
3621 				else
3622 					done = gifconf_list[i](dev, pos + total,
3623 							       len - total);
3624 				if (done < 0)
3625 					return -EFAULT;
3626 				total += done;
3627 			}
3628 		}
3629 	}
3630 
3631 	/*
3632 	 *	All done.  Write the updated control block back to the caller.
3633 	 */
3634 	ifc.ifc_len = total;
3635 
3636 	/*
3637 	 * 	Both BSD and Solaris return 0 here, so we do too.
3638 	 */
3639 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3640 }
3641 
3642 #ifdef CONFIG_PROC_FS
3643 /*
3644  *	This is invoked by the /proc filesystem handler to display a device
3645  *	in detail.
3646  */
3647 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3648 	__acquires(RCU)
3649 {
3650 	struct net *net = seq_file_net(seq);
3651 	loff_t off;
3652 	struct net_device *dev;
3653 
3654 	rcu_read_lock();
3655 	if (!*pos)
3656 		return SEQ_START_TOKEN;
3657 
3658 	off = 1;
3659 	for_each_netdev_rcu(net, dev)
3660 		if (off++ == *pos)
3661 			return dev;
3662 
3663 	return NULL;
3664 }
3665 
3666 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3667 {
3668 	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3669 				  first_net_device(seq_file_net(seq)) :
3670 				  next_net_device((struct net_device *)v);
3671 
3672 	++*pos;
3673 	return rcu_dereference(dev);
3674 }
3675 
3676 void dev_seq_stop(struct seq_file *seq, void *v)
3677 	__releases(RCU)
3678 {
3679 	rcu_read_unlock();
3680 }
3681 
3682 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3683 {
3684 	const struct net_device_stats *stats = dev_get_stats(dev);
3685 
3686 	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3687 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3688 		   dev->name, stats->rx_bytes, stats->rx_packets,
3689 		   stats->rx_errors,
3690 		   stats->rx_dropped + stats->rx_missed_errors,
3691 		   stats->rx_fifo_errors,
3692 		   stats->rx_length_errors + stats->rx_over_errors +
3693 		    stats->rx_crc_errors + stats->rx_frame_errors,
3694 		   stats->rx_compressed, stats->multicast,
3695 		   stats->tx_bytes, stats->tx_packets,
3696 		   stats->tx_errors, stats->tx_dropped,
3697 		   stats->tx_fifo_errors, stats->collisions,
3698 		   stats->tx_carrier_errors +
3699 		    stats->tx_aborted_errors +
3700 		    stats->tx_window_errors +
3701 		    stats->tx_heartbeat_errors,
3702 		   stats->tx_compressed);
3703 }
3704 
3705 /*
3706  *	Called from the PROCfs module. This now uses the new arbitrary sized
3707  *	/proc/net interface to create /proc/net/dev
3708  */
3709 static int dev_seq_show(struct seq_file *seq, void *v)
3710 {
3711 	if (v == SEQ_START_TOKEN)
3712 		seq_puts(seq, "Inter-|   Receive                            "
3713 			      "                    |  Transmit\n"
3714 			      " face |bytes    packets errs drop fifo frame "
3715 			      "compressed multicast|bytes    packets errs "
3716 			      "drop fifo colls carrier compressed\n");
3717 	else
3718 		dev_seq_printf_stats(seq, v);
3719 	return 0;
3720 }
3721 
3722 static struct softnet_data *softnet_get_online(loff_t *pos)
3723 {
3724 	struct softnet_data *sd = NULL;
3725 
3726 	while (*pos < nr_cpu_ids)
3727 		if (cpu_online(*pos)) {
3728 			sd = &per_cpu(softnet_data, *pos);
3729 			break;
3730 		} else
3731 			++*pos;
3732 	return sd;
3733 }
3734 
3735 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3736 {
3737 	return softnet_get_online(pos);
3738 }
3739 
3740 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3741 {
3742 	++*pos;
3743 	return softnet_get_online(pos);
3744 }
3745 
3746 static void softnet_seq_stop(struct seq_file *seq, void *v)
3747 {
3748 }
3749 
3750 static int softnet_seq_show(struct seq_file *seq, void *v)
3751 {
3752 	struct softnet_data *sd = v;
3753 
3754 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3755 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
3756 		   0, 0, 0, 0, /* was fastroute */
3757 		   sd->cpu_collision, sd->received_rps);
3758 	return 0;
3759 }
3760 
3761 static const struct seq_operations dev_seq_ops = {
3762 	.start = dev_seq_start,
3763 	.next  = dev_seq_next,
3764 	.stop  = dev_seq_stop,
3765 	.show  = dev_seq_show,
3766 };
3767 
3768 static int dev_seq_open(struct inode *inode, struct file *file)
3769 {
3770 	return seq_open_net(inode, file, &dev_seq_ops,
3771 			    sizeof(struct seq_net_private));
3772 }
3773 
3774 static const struct file_operations dev_seq_fops = {
3775 	.owner	 = THIS_MODULE,
3776 	.open    = dev_seq_open,
3777 	.read    = seq_read,
3778 	.llseek  = seq_lseek,
3779 	.release = seq_release_net,
3780 };
3781 
3782 static const struct seq_operations softnet_seq_ops = {
3783 	.start = softnet_seq_start,
3784 	.next  = softnet_seq_next,
3785 	.stop  = softnet_seq_stop,
3786 	.show  = softnet_seq_show,
3787 };
3788 
3789 static int softnet_seq_open(struct inode *inode, struct file *file)
3790 {
3791 	return seq_open(file, &softnet_seq_ops);
3792 }
3793 
3794 static const struct file_operations softnet_seq_fops = {
3795 	.owner	 = THIS_MODULE,
3796 	.open    = softnet_seq_open,
3797 	.read    = seq_read,
3798 	.llseek  = seq_lseek,
3799 	.release = seq_release,
3800 };
3801 
3802 static void *ptype_get_idx(loff_t pos)
3803 {
3804 	struct packet_type *pt = NULL;
3805 	loff_t i = 0;
3806 	int t;
3807 
3808 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3809 		if (i == pos)
3810 			return pt;
3811 		++i;
3812 	}
3813 
3814 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3815 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3816 			if (i == pos)
3817 				return pt;
3818 			++i;
3819 		}
3820 	}
3821 	return NULL;
3822 }
3823 
3824 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3825 	__acquires(RCU)
3826 {
3827 	rcu_read_lock();
3828 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3829 }
3830 
3831 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3832 {
3833 	struct packet_type *pt;
3834 	struct list_head *nxt;
3835 	int hash;
3836 
3837 	++*pos;
3838 	if (v == SEQ_START_TOKEN)
3839 		return ptype_get_idx(0);
3840 
3841 	pt = v;
3842 	nxt = pt->list.next;
3843 	if (pt->type == htons(ETH_P_ALL)) {
3844 		if (nxt != &ptype_all)
3845 			goto found;
3846 		hash = 0;
3847 		nxt = ptype_base[0].next;
3848 	} else
3849 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3850 
3851 	while (nxt == &ptype_base[hash]) {
3852 		if (++hash >= PTYPE_HASH_SIZE)
3853 			return NULL;
3854 		nxt = ptype_base[hash].next;
3855 	}
3856 found:
3857 	return list_entry(nxt, struct packet_type, list);
3858 }
3859 
3860 static void ptype_seq_stop(struct seq_file *seq, void *v)
3861 	__releases(RCU)
3862 {
3863 	rcu_read_unlock();
3864 }
3865 
3866 static int ptype_seq_show(struct seq_file *seq, void *v)
3867 {
3868 	struct packet_type *pt = v;
3869 
3870 	if (v == SEQ_START_TOKEN)
3871 		seq_puts(seq, "Type Device      Function\n");
3872 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3873 		if (pt->type == htons(ETH_P_ALL))
3874 			seq_puts(seq, "ALL ");
3875 		else
3876 			seq_printf(seq, "%04x", ntohs(pt->type));
3877 
3878 		seq_printf(seq, " %-8s %pF\n",
3879 			   pt->dev ? pt->dev->name : "", pt->func);
3880 	}
3881 
3882 	return 0;
3883 }
3884 
3885 static const struct seq_operations ptype_seq_ops = {
3886 	.start = ptype_seq_start,
3887 	.next  = ptype_seq_next,
3888 	.stop  = ptype_seq_stop,
3889 	.show  = ptype_seq_show,
3890 };
3891 
3892 static int ptype_seq_open(struct inode *inode, struct file *file)
3893 {
3894 	return seq_open_net(inode, file, &ptype_seq_ops,
3895 			sizeof(struct seq_net_private));
3896 }
3897 
3898 static const struct file_operations ptype_seq_fops = {
3899 	.owner	 = THIS_MODULE,
3900 	.open    = ptype_seq_open,
3901 	.read    = seq_read,
3902 	.llseek  = seq_lseek,
3903 	.release = seq_release_net,
3904 };
3905 
3906 
3907 static int __net_init dev_proc_net_init(struct net *net)
3908 {
3909 	int rc = -ENOMEM;
3910 
3911 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3912 		goto out;
3913 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3914 		goto out_dev;
3915 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3916 		goto out_softnet;
3917 
3918 	if (wext_proc_init(net))
3919 		goto out_ptype;
3920 	rc = 0;
3921 out:
3922 	return rc;
3923 out_ptype:
3924 	proc_net_remove(net, "ptype");
3925 out_softnet:
3926 	proc_net_remove(net, "softnet_stat");
3927 out_dev:
3928 	proc_net_remove(net, "dev");
3929 	goto out;
3930 }
3931 
3932 static void __net_exit dev_proc_net_exit(struct net *net)
3933 {
3934 	wext_proc_exit(net);
3935 
3936 	proc_net_remove(net, "ptype");
3937 	proc_net_remove(net, "softnet_stat");
3938 	proc_net_remove(net, "dev");
3939 }
3940 
3941 static struct pernet_operations __net_initdata dev_proc_ops = {
3942 	.init = dev_proc_net_init,
3943 	.exit = dev_proc_net_exit,
3944 };
3945 
3946 static int __init dev_proc_init(void)
3947 {
3948 	return register_pernet_subsys(&dev_proc_ops);
3949 }
3950 #else
3951 #define dev_proc_init() 0
3952 #endif	/* CONFIG_PROC_FS */
3953 
3954 
3955 /**
3956  *	netdev_set_master	-	set up master/slave pair
3957  *	@slave: slave device
3958  *	@master: new master device
3959  *
3960  *	Changes the master device of the slave. Pass %NULL to break the
3961  *	bonding. The caller must hold the RTNL semaphore. On a failure
3962  *	a negative errno code is returned. On success the reference counts
3963  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3964  *	function returns zero.
3965  */
3966 int netdev_set_master(struct net_device *slave, struct net_device *master)
3967 {
3968 	struct net_device *old = slave->master;
3969 
3970 	ASSERT_RTNL();
3971 
3972 	if (master) {
3973 		if (old)
3974 			return -EBUSY;
3975 		dev_hold(master);
3976 	}
3977 
3978 	slave->master = master;
3979 
3980 	if (old) {
3981 		synchronize_net();
3982 		dev_put(old);
3983 	}
3984 	if (master)
3985 		slave->flags |= IFF_SLAVE;
3986 	else
3987 		slave->flags &= ~IFF_SLAVE;
3988 
3989 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3990 	return 0;
3991 }
3992 EXPORT_SYMBOL(netdev_set_master);
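
/*
 * Usage sketch (hypothetical, not part of this file): a bonding-style
 * driver pairing a slave with its master and later breaking the pairing.
 * "bond_dev" and "slave_dev" are assumed to be valid, registered devices,
 * and the caller takes the RTNL as required above.
 *
 *	rtnl_lock();
 *	err = netdev_set_master(slave_dev, bond_dev);
 *	rtnl_unlock();
 *	if (err)
 *		return err;
 *	...
 *	rtnl_lock();
 *	netdev_set_master(slave_dev, NULL);
 *	rtnl_unlock();
 */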
3993 
3994 static void dev_change_rx_flags(struct net_device *dev, int flags)
3995 {
3996 	const struct net_device_ops *ops = dev->netdev_ops;
3997 
3998 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3999 		ops->ndo_change_rx_flags(dev, flags);
4000 }
4001 
4002 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4003 {
4004 	unsigned short old_flags = dev->flags;
4005 	uid_t uid;
4006 	gid_t gid;
4007 
4008 	ASSERT_RTNL();
4009 
4010 	dev->flags |= IFF_PROMISC;
4011 	dev->promiscuity += inc;
4012 	if (dev->promiscuity == 0) {
4013 		/*
4014 		 * Avoid overflow.  If inc would overflow the counter,
4015 		 * leave the promiscuity count untouched and return an error.
4016 		 */
4017 		if (inc < 0)
4018 			dev->flags &= ~IFF_PROMISC;
4019 		else {
4020 			dev->promiscuity -= inc;
4021 			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4022 				"could not enable promiscuous mode; promiscuity "
4023 				"handling on the device may be unreliable.\n", dev->name);
4024 			return -EOVERFLOW;
4025 		}
4026 	}
4027 	if (dev->flags != old_flags) {
4028 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4029 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4030 							       "left");
4031 		if (audit_enabled) {
4032 			current_uid_gid(&uid, &gid);
4033 			audit_log(current->audit_context, GFP_ATOMIC,
4034 				AUDIT_ANOM_PROMISCUOUS,
4035 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4036 				dev->name, (dev->flags & IFF_PROMISC),
4037 				(old_flags & IFF_PROMISC),
4038 				audit_get_loginuid(current),
4039 				uid, gid,
4040 				audit_get_sessionid(current));
4041 		}
4042 
4043 		dev_change_rx_flags(dev, IFF_PROMISC);
4044 	}
4045 	return 0;
4046 }
4047 
4048 /**
4049  *	dev_set_promiscuity	- update promiscuity count on a device
4050  *	@dev: device
4051  *	@inc: modifier
4052  *
4053  *	Add or remove promiscuity from a device. While the count in the device
4054  *	remains above zero the interface remains promiscuous. Once it hits zero
4055  *	the device reverts to normal filtering operation. A negative @inc
4056  *	value is used to drop promiscuity on the device.
4057  *	Return 0 if successful or a negative errno code on error.
4058  */
4059 int dev_set_promiscuity(struct net_device *dev, int inc)
4060 {
4061 	unsigned short old_flags = dev->flags;
4062 	int err;
4063 
4064 	err = __dev_set_promiscuity(dev, inc);
4065 	if (err < 0)
4066 		return err;
4067 	if (dev->flags != old_flags)
4068 		dev_set_rx_mode(dev);
4069 	return err;
4070 }
4071 EXPORT_SYMBOL(dev_set_promiscuity);
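
/*
 * Usage sketch (hypothetical): a capture-style driver taking a promiscuity
 * reference while it is active and dropping it on teardown.  The increment
 * can fail on counter overflow, so its return value must be checked.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err < 0)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 */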
4072 
4073 /**
4074  *	dev_set_allmulti	- update allmulti count on a device
4075  *	@dev: device
4076  *	@inc: modifier
4077  *
4078  *	Add or remove reception of all multicast frames to a device. While the
4079  *	count in the device remains above zero the interface remains listening
4080  *	to all multicast frames. Once it hits zero the device reverts to normal
4081  *	filtering operation. A negative @inc value is used to drop the counter
4082  *	when releasing a resource needing all multicasts.
4083  *	Return 0 if successful or a negative errno code on error.
4084  */
4085 
4086 int dev_set_allmulti(struct net_device *dev, int inc)
4087 {
4088 	unsigned short old_flags = dev->flags;
4089 
4090 	ASSERT_RTNL();
4091 
4092 	dev->flags |= IFF_ALLMULTI;
4093 	dev->allmulti += inc;
4094 	if (dev->allmulti == 0) {
4095 		/*
4096 		 * Avoid overflow.  If inc would overflow the counter,
4097 		 * leave the allmulti count untouched and return an error.
4098 		 */
4099 		if (inc < 0)
4100 			dev->flags &= ~IFF_ALLMULTI;
4101 		else {
4102 			dev->allmulti -= inc;
4103 			printk(KERN_WARNING "%s: allmulti counter overflowed, "
4104 				"could not enable all-multicast mode; allmulti "
4105 				"handling on the device may be unreliable.\n", dev->name);
4106 			return -EOVERFLOW;
4107 		}
4108 	}
4109 	if (dev->flags ^ old_flags) {
4110 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4111 		dev_set_rx_mode(dev);
4112 	}
4113 	return 0;
4114 }
4115 EXPORT_SYMBOL(dev_set_allmulti);
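
/*
 * Usage sketch (hypothetical): a driver that needs every multicast frame
 * (for example while running a routing protocol) holds an allmulti
 * reference for the lifetime of that need.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_allmulti(dev, 1);
 *	if (err < 0)
 *		return err;
 *	...
 *	dev_set_allmulti(dev, -1);
 */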
4116 
4117 /*
4118  *	Upload the unicast and multicast address lists to the device and
4119  *	configure RX filtering. When the device doesn't support unicast
4120  *	filtering it is put in promiscuous mode while unicast addresses
4121  *	are present.
4122  */
4123 void __dev_set_rx_mode(struct net_device *dev)
4124 {
4125 	const struct net_device_ops *ops = dev->netdev_ops;
4126 
4127 	/* dev_open will call this function so the list will stay sane. */
4128 	if (!(dev->flags&IFF_UP))
4129 		return;
4130 
4131 	if (!netif_device_present(dev))
4132 		return;
4133 
4134 	if (ops->ndo_set_rx_mode)
4135 		ops->ndo_set_rx_mode(dev);
4136 	else {
4137 		/* Unicast addresses changes may only happen under the rtnl,
4138 		 * therefore calling __dev_set_promiscuity here is safe.
4139 		 */
4140 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4141 			__dev_set_promiscuity(dev, 1);
4142 			dev->uc_promisc = 1;
4143 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4144 			__dev_set_promiscuity(dev, -1);
4145 			dev->uc_promisc = 0;
4146 		}
4147 
4148 		if (ops->ndo_set_multicast_list)
4149 			ops->ndo_set_multicast_list(dev);
4150 	}
4151 }
4152 
4153 void dev_set_rx_mode(struct net_device *dev)
4154 {
4155 	netif_addr_lock_bh(dev);
4156 	__dev_set_rx_mode(dev);
4157 	netif_addr_unlock_bh(dev);
4158 }
4159 
4160 /**
4161  *	dev_get_flags - get flags reported to userspace
4162  *	@dev: device
4163  *
4164  *	Get the combination of flag bits exported through APIs to userspace.
4165  */
4166 unsigned dev_get_flags(const struct net_device *dev)
4167 {
4168 	unsigned flags;
4169 
4170 	flags = (dev->flags & ~(IFF_PROMISC |
4171 				IFF_ALLMULTI |
4172 				IFF_RUNNING |
4173 				IFF_LOWER_UP |
4174 				IFF_DORMANT)) |
4175 		(dev->gflags & (IFF_PROMISC |
4176 				IFF_ALLMULTI));
4177 
4178 	if (netif_running(dev)) {
4179 		if (netif_oper_up(dev))
4180 			flags |= IFF_RUNNING;
4181 		if (netif_carrier_ok(dev))
4182 			flags |= IFF_LOWER_UP;
4183 		if (netif_dormant(dev))
4184 			flags |= IFF_DORMANT;
4185 	}
4186 
4187 	return flags;
4188 }
4189 EXPORT_SYMBOL(dev_get_flags);
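
/*
 * Usage sketch (hypothetical): testing the userspace-visible flag
 * combination, e.g. to report whether an interface is up with carrier.
 *
 *	unsigned flags = dev_get_flags(dev);
 *
 *	if ((flags & IFF_UP) && (flags & IFF_LOWER_UP))
 *		printk(KERN_DEBUG "%s is up with carrier\n", dev->name);
 */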
4190 
4191 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4192 {
4193 	int old_flags = dev->flags;
4194 	int ret;
4195 
4196 	ASSERT_RTNL();
4197 
4198 	/*
4199 	 *	Set the flags on our device.
4200 	 */
4201 
4202 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4203 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4204 			       IFF_AUTOMEDIA)) |
4205 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4206 				    IFF_ALLMULTI));
4207 
4208 	/*
4209 	 *	Load in the correct multicast list now the flags have changed.
4210 	 */
4211 
4212 	if ((old_flags ^ flags) & IFF_MULTICAST)
4213 		dev_change_rx_flags(dev, IFF_MULTICAST);
4214 
4215 	dev_set_rx_mode(dev);
4216 
4217 	/*
4218 	 *	Have we downed the interface?  We handle IFF_UP ourselves
4219 	 *	according to user attempts to set it, rather than blindly
4220 	 *	setting it.
4221 	 */
4222 
4223 	ret = 0;
4224 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4225 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4226 
4227 		if (!ret)
4228 			dev_set_rx_mode(dev);
4229 	}
4230 
4231 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4232 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4233 
4234 		dev->gflags ^= IFF_PROMISC;
4235 		dev_set_promiscuity(dev, inc);
4236 	}
4237 
4238 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4239 	   is important. Some (broken) drivers set IFF_PROMISC when
4240 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4241 	 */
4242 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4243 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4244 
4245 		dev->gflags ^= IFF_ALLMULTI;
4246 		dev_set_allmulti(dev, inc);
4247 	}
4248 
4249 	return ret;
4250 }
4251 
4252 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4253 {
4254 	unsigned int changes = dev->flags ^ old_flags;
4255 
4256 	if (changes & IFF_UP) {
4257 		if (dev->flags & IFF_UP)
4258 			call_netdevice_notifiers(NETDEV_UP, dev);
4259 		else
4260 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4261 	}
4262 
4263 	if (dev->flags & IFF_UP &&
4264 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4265 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4266 }
4267 
4268 /**
4269  *	dev_change_flags - change device settings
4270  *	@dev: device
4271  *	@flags: device state flags
4272  *
4273  *	Change settings on device based state flags. The flags are
4274  *	in the userspace exported format.
4275  */
4276 int dev_change_flags(struct net_device *dev, unsigned flags)
4277 {
4278 	int ret, changes;
4279 	int old_flags = dev->flags;
4280 
4281 	ret = __dev_change_flags(dev, flags);
4282 	if (ret < 0)
4283 		return ret;
4284 
4285 	changes = old_flags ^ dev->flags;
4286 	if (changes)
4287 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4288 
4289 	__dev_notify_flags(dev, old_flags);
4290 	return ret;
4291 }
4292 EXPORT_SYMBOL(dev_change_flags);
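
/*
 * Usage sketch (hypothetical): bringing an interface up from kernel code
 * the same way the SIOCSIFFLAGS ioctl path below does, under the RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */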
4293 
4294 /**
4295  *	dev_set_mtu - Change maximum transfer unit
4296  *	@dev: device
4297  *	@new_mtu: new transfer unit
4298  *
4299  *	Change the maximum transfer size of the network device.
4300  */
4301 int dev_set_mtu(struct net_device *dev, int new_mtu)
4302 {
4303 	const struct net_device_ops *ops = dev->netdev_ops;
4304 	int err;
4305 
4306 	if (new_mtu == dev->mtu)
4307 		return 0;
4308 
4309 	/*	MTU must be positive.	 */
4310 	if (new_mtu < 0)
4311 		return -EINVAL;
4312 
4313 	if (!netif_device_present(dev))
4314 		return -ENODEV;
4315 
4316 	err = 0;
4317 	if (ops->ndo_change_mtu)
4318 		err = ops->ndo_change_mtu(dev, new_mtu);
4319 	else
4320 		dev->mtu = new_mtu;
4321 
4322 	if (!err && dev->flags & IFF_UP)
4323 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4324 	return err;
4325 }
4326 EXPORT_SYMBOL(dev_set_mtu);
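
/*
 * Usage sketch (hypothetical): a tunnel-style driver shrinking the MTU of
 * a lower device.  The driver backing "lower_dev" may veto the value via
 * ndo_change_mtu, so the return code must be checked.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(lower_dev, new_mtu);
 *	rtnl_unlock();
 *	if (err)
 *		printk(KERN_WARNING "cannot set MTU %d on %s: %d\n",
 *		       new_mtu, lower_dev->name, err);
 */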
4327 
4328 /**
4329  *	dev_set_mac_address - Change Media Access Control Address
4330  *	@dev: device
4331  *	@sa: new address
4332  *
4333  *	Change the hardware (MAC) address of the device
4334  */
4335 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4336 {
4337 	const struct net_device_ops *ops = dev->netdev_ops;
4338 	int err;
4339 
4340 	if (!ops->ndo_set_mac_address)
4341 		return -EOPNOTSUPP;
4342 	if (sa->sa_family != dev->type)
4343 		return -EINVAL;
4344 	if (!netif_device_present(dev))
4345 		return -ENODEV;
4346 	err = ops->ndo_set_mac_address(dev, sa);
4347 	if (!err)
4348 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4349 	return err;
4350 }
4351 EXPORT_SYMBOL(dev_set_mac_address);
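
/*
 * Usage sketch (hypothetical): setting a new hardware address from kernel
 * code, mirroring SIOCSIFHWADDR.  sa_family must match dev->type and
 * "new_addr" is assumed to hold dev->addr_len bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */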
4352 
4353 /*
4354  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4355  */
4356 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4357 {
4358 	int err;
4359 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4360 
4361 	if (!dev)
4362 		return -ENODEV;
4363 
4364 	switch (cmd) {
4365 	case SIOCGIFFLAGS:	/* Get interface flags */
4366 		ifr->ifr_flags = (short) dev_get_flags(dev);
4367 		return 0;
4368 
4369 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4370 				   (currently unused) */
4371 		ifr->ifr_metric = 0;
4372 		return 0;
4373 
4374 	case SIOCGIFMTU:	/* Get the MTU of a device */
4375 		ifr->ifr_mtu = dev->mtu;
4376 		return 0;
4377 
4378 	case SIOCGIFHWADDR:
4379 		if (!dev->addr_len)
4380 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4381 		else
4382 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4383 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4384 		ifr->ifr_hwaddr.sa_family = dev->type;
4385 		return 0;
4386 
4387 	case SIOCGIFSLAVE:
4388 		err = -EINVAL;
4389 		break;
4390 
4391 	case SIOCGIFMAP:
4392 		ifr->ifr_map.mem_start = dev->mem_start;
4393 		ifr->ifr_map.mem_end   = dev->mem_end;
4394 		ifr->ifr_map.base_addr = dev->base_addr;
4395 		ifr->ifr_map.irq       = dev->irq;
4396 		ifr->ifr_map.dma       = dev->dma;
4397 		ifr->ifr_map.port      = dev->if_port;
4398 		return 0;
4399 
4400 	case SIOCGIFINDEX:
4401 		ifr->ifr_ifindex = dev->ifindex;
4402 		return 0;
4403 
4404 	case SIOCGIFTXQLEN:
4405 		ifr->ifr_qlen = dev->tx_queue_len;
4406 		return 0;
4407 
4408 	default:
4409 		/* dev_ioctl() should ensure this case
4410 		 * is never reached
4411 		 */
4412 		WARN_ON(1);
4413 		err = -EINVAL;
4414 		break;
4415 
4416 	}
4417 	return err;
4418 }
4419 
4420 /*
4421  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4422  */
4423 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4424 {
4425 	int err;
4426 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4427 	const struct net_device_ops *ops;
4428 
4429 	if (!dev)
4430 		return -ENODEV;
4431 
4432 	ops = dev->netdev_ops;
4433 
4434 	switch (cmd) {
4435 	case SIOCSIFFLAGS:	/* Set interface flags */
4436 		return dev_change_flags(dev, ifr->ifr_flags);
4437 
4438 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4439 				   (currently unused) */
4440 		return -EOPNOTSUPP;
4441 
4442 	case SIOCSIFMTU:	/* Set the MTU of a device */
4443 		return dev_set_mtu(dev, ifr->ifr_mtu);
4444 
4445 	case SIOCSIFHWADDR:
4446 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4447 
4448 	case SIOCSIFHWBROADCAST:
4449 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4450 			return -EINVAL;
4451 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4452 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4453 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4454 		return 0;
4455 
4456 	case SIOCSIFMAP:
4457 		if (ops->ndo_set_config) {
4458 			if (!netif_device_present(dev))
4459 				return -ENODEV;
4460 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4461 		}
4462 		return -EOPNOTSUPP;
4463 
4464 	case SIOCADDMULTI:
4465 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4466 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4467 			return -EINVAL;
4468 		if (!netif_device_present(dev))
4469 			return -ENODEV;
4470 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4471 
4472 	case SIOCDELMULTI:
4473 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4474 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4475 			return -EINVAL;
4476 		if (!netif_device_present(dev))
4477 			return -ENODEV;
4478 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4479 
4480 	case SIOCSIFTXQLEN:
4481 		if (ifr->ifr_qlen < 0)
4482 			return -EINVAL;
4483 		dev->tx_queue_len = ifr->ifr_qlen;
4484 		return 0;
4485 
4486 	case SIOCSIFNAME:
4487 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4488 		return dev_change_name(dev, ifr->ifr_newname);
4489 
4490 	/*
4491 	 *	Unknown or private ioctl
4492 	 */
4493 	default:
4494 		if ((cmd >= SIOCDEVPRIVATE &&
4495 		    cmd <= SIOCDEVPRIVATE + 15) ||
4496 		    cmd == SIOCBONDENSLAVE ||
4497 		    cmd == SIOCBONDRELEASE ||
4498 		    cmd == SIOCBONDSETHWADDR ||
4499 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4500 		    cmd == SIOCBONDINFOQUERY ||
4501 		    cmd == SIOCBONDCHANGEACTIVE ||
4502 		    cmd == SIOCGMIIPHY ||
4503 		    cmd == SIOCGMIIREG ||
4504 		    cmd == SIOCSMIIREG ||
4505 		    cmd == SIOCBRADDIF ||
4506 		    cmd == SIOCBRDELIF ||
4507 		    cmd == SIOCSHWTSTAMP ||
4508 		    cmd == SIOCWANDEV) {
4509 			err = -EOPNOTSUPP;
4510 			if (ops->ndo_do_ioctl) {
4511 				if (netif_device_present(dev))
4512 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4513 				else
4514 					err = -ENODEV;
4515 			}
4516 		} else
4517 			err = -EINVAL;
4518 
4519 	}
4520 	return err;
4521 }
4522 
4523 /*
4524  *	This function handles all "interface"-type I/O control requests. The actual
4525  *	'doing' part of this is dev_ifsioc above.
4526  */
4527 
4528 /**
4529  *	dev_ioctl	-	network device ioctl
4530  *	@net: the applicable net namespace
4531  *	@cmd: command to issue
4532  *	@arg: pointer to a struct ifreq in user space
4533  *
4534  *	Issue ioctl functions to devices. This is normally called by the
4535  *	user space syscall interfaces but can sometimes be useful for
4536  *	other purposes. The return value is the return from the syscall if
4537  *	positive or a negative errno code on error.
4538  */
4539 
4540 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4541 {
4542 	struct ifreq ifr;
4543 	int ret;
4544 	char *colon;
4545 
4546 	/* One special case: SIOCGIFCONF takes an ifconf argument
4547 	   and requires a shared lock, because it sleeps while writing
4548 	   to user space.
4549 	 */
4550 
4551 	if (cmd == SIOCGIFCONF) {
4552 		rtnl_lock();
4553 		ret = dev_ifconf(net, (char __user *) arg);
4554 		rtnl_unlock();
4555 		return ret;
4556 	}
4557 	if (cmd == SIOCGIFNAME)
4558 		return dev_ifname(net, (struct ifreq __user *)arg);
4559 
4560 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4561 		return -EFAULT;
4562 
4563 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4564 
4565 	colon = strchr(ifr.ifr_name, ':');
4566 	if (colon)
4567 		*colon = 0;
4568 
4569 	/*
4570 	 *	See which interface the caller is talking about.
4571 	 */
4572 
4573 	switch (cmd) {
4574 	/*
4575 	 *	These ioctl calls:
4576 	 *	- can be done by all.
4577 	 *	- atomic and do not require locking.
4578 	 *	- return a value
4579 	 */
4580 	case SIOCGIFFLAGS:
4581 	case SIOCGIFMETRIC:
4582 	case SIOCGIFMTU:
4583 	case SIOCGIFHWADDR:
4584 	case SIOCGIFSLAVE:
4585 	case SIOCGIFMAP:
4586 	case SIOCGIFINDEX:
4587 	case SIOCGIFTXQLEN:
4588 		dev_load(net, ifr.ifr_name);
4589 		rcu_read_lock();
4590 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4591 		rcu_read_unlock();
4592 		if (!ret) {
4593 			if (colon)
4594 				*colon = ':';
4595 			if (copy_to_user(arg, &ifr,
4596 					 sizeof(struct ifreq)))
4597 				ret = -EFAULT;
4598 		}
4599 		return ret;
4600 
4601 	case SIOCETHTOOL:
4602 		dev_load(net, ifr.ifr_name);
4603 		rtnl_lock();
4604 		ret = dev_ethtool(net, &ifr);
4605 		rtnl_unlock();
4606 		if (!ret) {
4607 			if (colon)
4608 				*colon = ':';
4609 			if (copy_to_user(arg, &ifr,
4610 					 sizeof(struct ifreq)))
4611 				ret = -EFAULT;
4612 		}
4613 		return ret;
4614 
4615 	/*
4616 	 *	These ioctl calls:
4617 	 *	- require superuser power.
4618 	 *	- require strict serialization.
4619 	 *	- return a value
4620 	 */
4621 	case SIOCGMIIPHY:
4622 	case SIOCGMIIREG:
4623 	case SIOCSIFNAME:
4624 		if (!capable(CAP_NET_ADMIN))
4625 			return -EPERM;
4626 		dev_load(net, ifr.ifr_name);
4627 		rtnl_lock();
4628 		ret = dev_ifsioc(net, &ifr, cmd);
4629 		rtnl_unlock();
4630 		if (!ret) {
4631 			if (colon)
4632 				*colon = ':';
4633 			if (copy_to_user(arg, &ifr,
4634 					 sizeof(struct ifreq)))
4635 				ret = -EFAULT;
4636 		}
4637 		return ret;
4638 
4639 	/*
4640 	 *	These ioctl calls:
4641 	 *	- require superuser power.
4642 	 *	- require strict serialization.
4643 	 *	- do not return a value
4644 	 */
4645 	case SIOCSIFFLAGS:
4646 	case SIOCSIFMETRIC:
4647 	case SIOCSIFMTU:
4648 	case SIOCSIFMAP:
4649 	case SIOCSIFHWADDR:
4650 	case SIOCSIFSLAVE:
4651 	case SIOCADDMULTI:
4652 	case SIOCDELMULTI:
4653 	case SIOCSIFHWBROADCAST:
4654 	case SIOCSIFTXQLEN:
4655 	case SIOCSMIIREG:
4656 	case SIOCBONDENSLAVE:
4657 	case SIOCBONDRELEASE:
4658 	case SIOCBONDSETHWADDR:
4659 	case SIOCBONDCHANGEACTIVE:
4660 	case SIOCBRADDIF:
4661 	case SIOCBRDELIF:
4662 	case SIOCSHWTSTAMP:
4663 		if (!capable(CAP_NET_ADMIN))
4664 			return -EPERM;
4665 		/* fall through */
4666 	case SIOCBONDSLAVEINFOQUERY:
4667 	case SIOCBONDINFOQUERY:
4668 		dev_load(net, ifr.ifr_name);
4669 		rtnl_lock();
4670 		ret = dev_ifsioc(net, &ifr, cmd);
4671 		rtnl_unlock();
4672 		return ret;
4673 
4674 	case SIOCGIFMEM:
4675 		/* Get the per device memory space. We can add this but
4676 		 * currently do not support it */
4677 	case SIOCSIFMEM:
4678 		/* Set the per device memory buffer space.
4679 		 * Not applicable in our case */
4680 	case SIOCSIFLINK:
4681 		return -EINVAL;
4682 
4683 	/*
4684 	 *	Unknown or private ioctl.
4685 	 */
4686 	default:
4687 		if (cmd == SIOCWANDEV ||
4688 		    (cmd >= SIOCDEVPRIVATE &&
4689 		     cmd <= SIOCDEVPRIVATE + 15)) {
4690 			dev_load(net, ifr.ifr_name);
4691 			rtnl_lock();
4692 			ret = dev_ifsioc(net, &ifr, cmd);
4693 			rtnl_unlock();
4694 			if (!ret && copy_to_user(arg, &ifr,
4695 						 sizeof(struct ifreq)))
4696 				ret = -EFAULT;
4697 			return ret;
4698 		}
4699 		/* Take care of Wireless Extensions */
4700 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4701 			return wext_handle_ioctl(net, &ifr, cmd, arg);
4702 		return -EINVAL;
4703 	}
4704 }
4705 
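/*
 * Usage sketch (hypothetical, userspace side): querying an interface MTU
 * through the SIOCGIFMTU path handled above.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu=%d\n", ifr.ifr_mtu);
 *	close(fd);
 */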
4706 
4707 /**
4708  *	dev_new_index	-	allocate an ifindex
4709  *	@net: the applicable net namespace
4710  *
4711  *	Returns a suitable unique value for a new device interface
4712  *	number.  The caller must hold the rtnl semaphore or the
4713  *	dev_base_lock to be sure it remains unique.
4714  */
4715 static int dev_new_index(struct net *net)
4716 {
4717 	static int ifindex;
4718 	for (;;) {
4719 		if (++ifindex <= 0)
4720 			ifindex = 1;
4721 		if (!__dev_get_by_index(net, ifindex))
4722 			return ifindex;
4723 	}
4724 }
4725 
4726 /* Delayed registration/unregisteration */
4727 static LIST_HEAD(net_todo_list);
4728 
4729 static void net_set_todo(struct net_device *dev)
4730 {
4731 	list_add_tail(&dev->todo_list, &net_todo_list);
4732 }
4733 
4734 static void rollback_registered_many(struct list_head *head)
4735 {
4736 	struct net_device *dev, *tmp;
4737 
4738 	BUG_ON(dev_boot_phase);
4739 	ASSERT_RTNL();
4740 
4741 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4742 		/* Some devices call this for initialization unwind
4743 		 * without ever having been registered.  Remove those
4744 		 * devices and proceed with the remaining ones.
4745 		 */
4746 		if (dev->reg_state == NETREG_UNINITIALIZED) {
4747 			pr_debug("unregister_netdevice: device %s/%p never "
4748 				 "was registered\n", dev->name, dev);
4749 
4750 			WARN_ON(1);
4751 			list_del(&dev->unreg_list);
4752 			continue;
4753 		}
4754 
4755 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4756 
4757 		/* If device is running, close it first. */
4758 		dev_close(dev);
4759 
4760 		/* And unlink it from device chain. */
4761 		unlist_netdevice(dev);
4762 
4763 		dev->reg_state = NETREG_UNREGISTERING;
4764 	}
4765 
4766 	synchronize_net();
4767 
4768 	list_for_each_entry(dev, head, unreg_list) {
4769 		/* Shutdown queueing discipline. */
4770 		dev_shutdown(dev);
4771 
4772 
4773 		/* Notify protocols, that we are about to destroy
4774 		   this device. They should clean all the things.
4775 		*/
4776 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4777 
4778 		if (!dev->rtnl_link_ops ||
4779 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4780 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4781 
4782 		/*
4783 		 *	Flush the unicast and multicast chains
4784 		 */
4785 		dev_uc_flush(dev);
4786 		dev_mc_flush(dev);
4787 
4788 		if (dev->netdev_ops->ndo_uninit)
4789 			dev->netdev_ops->ndo_uninit(dev);
4790 
4791 		/* Notifier chain MUST detach us from master device. */
4792 		WARN_ON(dev->master);
4793 
4794 		/* Remove entries from kobject tree */
4795 		netdev_unregister_kobject(dev);
4796 	}
4797 
4798 	/* Process any work delayed until the end of the batch */
4799 	dev = list_first_entry(head, struct net_device, unreg_list);
4800 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4801 
4802 	synchronize_net();
4803 
4804 	list_for_each_entry(dev, head, unreg_list)
4805 		dev_put(dev);
4806 }
4807 
4808 static void rollback_registered(struct net_device *dev)
4809 {
4810 	LIST_HEAD(single);
4811 
4812 	list_add(&dev->unreg_list, &single);
4813 	rollback_registered_many(&single);
4814 }
4815 
4816 static void __netdev_init_queue_locks_one(struct net_device *dev,
4817 					  struct netdev_queue *dev_queue,
4818 					  void *_unused)
4819 {
4820 	spin_lock_init(&dev_queue->_xmit_lock);
4821 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4822 	dev_queue->xmit_lock_owner = -1;
4823 }
4824 
4825 static void netdev_init_queue_locks(struct net_device *dev)
4826 {
4827 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4828 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4829 }
4830 
4831 unsigned long netdev_fix_features(unsigned long features, const char *name)
4832 {
4833 	/* Fix illegal SG+CSUM combinations. */
4834 	if ((features & NETIF_F_SG) &&
4835 	    !(features & NETIF_F_ALL_CSUM)) {
4836 		if (name)
4837 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4838 			       "checksum feature.\n", name);
4839 		features &= ~NETIF_F_SG;
4840 	}
4841 
4842 	/* TSO requires that SG is present as well. */
4843 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4844 		if (name)
4845 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4846 			       "SG feature.\n", name);
4847 		features &= ~NETIF_F_TSO;
4848 	}
4849 
4850 	if (features & NETIF_F_UFO) {
4851 		if (!(features & NETIF_F_GEN_CSUM)) {
4852 			if (name)
4853 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4854 				       "since no NETIF_F_HW_CSUM feature.\n",
4855 				       name);
4856 			features &= ~NETIF_F_UFO;
4857 		}
4858 
4859 		if (!(features & NETIF_F_SG)) {
4860 			if (name)
4861 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4862 				       "since no NETIF_F_SG feature.\n", name);
4863 			features &= ~NETIF_F_UFO;
4864 		}
4865 	}
4866 
4867 	return features;
4868 }
4869 EXPORT_SYMBOL(netdev_fix_features);
4870 
4871 /**
4872  *	netif_stacked_transfer_operstate -	transfer operstate
4873  *	@rootdev: the root or lower level device to transfer state from
4874  *	@dev: the device to transfer operstate to
4875  *
4876  *	Transfer operational state from root to device. This is normally
4877  *	called when a stacking relationship exists between the root
4878  *	device and the device (a leaf device).
4879  */
4880 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4881 					struct net_device *dev)
4882 {
4883 	if (rootdev->operstate == IF_OPER_DORMANT)
4884 		netif_dormant_on(dev);
4885 	else
4886 		netif_dormant_off(dev);
4887 
4888 	if (netif_carrier_ok(rootdev)) {
4889 		if (!netif_carrier_ok(dev))
4890 			netif_carrier_on(dev);
4891 	} else {
4892 		if (netif_carrier_ok(dev))
4893 			netif_carrier_off(dev);
4894 	}
4895 }
4896 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
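
/*
 * Usage sketch (hypothetical): a VLAN/macvlan-style driver, on receiving a
 * NETDEV_CHANGE event for its lower device "lower_dev", mirrors that state
 * onto the virtual device "stacked_dev" built on top of it:
 *
 *	netif_stacked_transfer_operstate(lower_dev, stacked_dev);
 */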
4897 
4898 /**
4899  *	register_netdevice	- register a network device
4900  *	@dev: device to register
4901  *
4902  *	Take a completed network device structure and add it to the kernel
4903  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4904  *	chain. 0 is returned on success. A negative errno code is returned
4905  *	on a failure to set up the device, or if the name is a duplicate.
4906  *
4907  *	Callers must hold the rtnl semaphore. You may want
4908  *	register_netdev() instead of this.
4909  *
4910  *	BUGS:
4911  *	The locking appears insufficient to guarantee two parallel registers
4912  *	will not get the same name.
4913  */
4914 
4915 int register_netdevice(struct net_device *dev)
4916 {
4917 	int ret;
4918 	struct net *net = dev_net(dev);
4919 
4920 	BUG_ON(dev_boot_phase);
4921 	ASSERT_RTNL();
4922 
4923 	might_sleep();
4924 
4925 	/* When net_devices are persistent, this will be fatal. */
4926 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4927 	BUG_ON(!net);
4928 
4929 	spin_lock_init(&dev->addr_list_lock);
4930 	netdev_set_addr_lockdep_class(dev);
4931 	netdev_init_queue_locks(dev);
4932 
4933 	dev->iflink = -1;
4934 
4935 #ifdef CONFIG_RPS
4936 	if (!dev->num_rx_queues) {
4937 		/*
4938 		 * Allocate a single RX queue if driver never called
4939 		 * alloc_netdev_mq
4940 		 */
4941 
4942 		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4943 		if (!dev->_rx) {
4944 			ret = -ENOMEM;
4945 			goto out;
4946 		}
4947 
4948 		dev->_rx->first = dev->_rx;
4949 		atomic_set(&dev->_rx->count, 1);
4950 		dev->num_rx_queues = 1;
4951 	}
4952 #endif
4953 	/* Init, if this function is available */
4954 	if (dev->netdev_ops->ndo_init) {
4955 		ret = dev->netdev_ops->ndo_init(dev);
4956 		if (ret) {
4957 			if (ret > 0)
4958 				ret = -EIO;
4959 			goto out;
4960 		}
4961 	}
4962 
4963 	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
4964 	if (ret)
4965 		goto err_uninit;
4966 
4967 	dev->ifindex = dev_new_index(net);
4968 	if (dev->iflink == -1)
4969 		dev->iflink = dev->ifindex;
4970 
4971 	/* Fix illegal checksum combinations */
4972 	if ((dev->features & NETIF_F_HW_CSUM) &&
4973 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4974 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4975 		       dev->name);
4976 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4977 	}
4978 
4979 	if ((dev->features & NETIF_F_NO_CSUM) &&
4980 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4981 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4982 		       dev->name);
4983 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4984 	}
4985 
4986 	dev->features = netdev_fix_features(dev->features, dev->name);
4987 
4988 	/* Enable software GSO if SG is supported. */
4989 	if (dev->features & NETIF_F_SG)
4990 		dev->features |= NETIF_F_GSO;
4991 
4992 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
4993 	ret = notifier_to_errno(ret);
4994 	if (ret)
4995 		goto err_uninit;
4996 
4997 	ret = netdev_register_kobject(dev);
4998 	if (ret)
4999 		goto err_uninit;
5000 	dev->reg_state = NETREG_REGISTERED;
5001 
5002 	/*
5003 	 *	Default initial state at registry is that the
5004 	 *	device is present.
5005 	 */
5006 
5007 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5008 
5009 	dev_init_scheduler(dev);
5010 	dev_hold(dev);
5011 	list_netdevice(dev);
5012 
5013 	/* Notify protocols, that a new device appeared. */
5014 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5015 	ret = notifier_to_errno(ret);
5016 	if (ret) {
5017 		rollback_registered(dev);
5018 		dev->reg_state = NETREG_UNREGISTERED;
5019 	}
5020 	/*
5021 	 *	Prevent userspace races by waiting until the network
5022 	 *	device is fully setup before sending notifications.
5023 	 */
5024 	if (!dev->rtnl_link_ops ||
5025 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5026 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5027 
5028 out:
5029 	return ret;
5030 
5031 err_uninit:
5032 	if (dev->netdev_ops->ndo_uninit)
5033 		dev->netdev_ops->ndo_uninit(dev);
5034 	goto out;
5035 }
5036 EXPORT_SYMBOL(register_netdevice);
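
/*
 * Usage sketch (hypothetical): registering from a context that already
 * holds the RTNL, such as an rtnl_link_ops->newlink() implementation.
 *
 *	ASSERT_RTNL();
 *	err = register_netdevice(dev);
 *	if (err < 0) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */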
5037 
5038 /**
5039  *	init_dummy_netdev	- init a dummy network device for NAPI
5040  *	@dev: device to init
5041  *
5042  *	This takes a network device structure and initializes the minimum
5043  *	number of fields needed so it can be used to schedule NAPI polls without
5044  *	registering a full blown interface. This is to be used by drivers
5045  *	that need to tie several hardware interfaces to a single NAPI
5046  *	poll scheduler due to HW limitations.
5047  */
5048 int init_dummy_netdev(struct net_device *dev)
5049 {
5050 	/* Clear everything. Note we don't initialize spinlocks
5051 	 * as they aren't supposed to be taken by any of the
5052 	 * NAPI code and this dummy netdev is supposed to be
5053 	 * only ever used for NAPI polls
5054 	 */
5055 	memset(dev, 0, sizeof(struct net_device));
5056 
5057 	/* make sure we BUG if trying to hit standard
5058 	 * register/unregister code path
5059 	 */
5060 	dev->reg_state = NETREG_DUMMY;
5061 
5062 	/* initialize the ref count */
5063 	atomic_set(&dev->refcnt, 1);
5064 
5065 	/* NAPI wants this */
5066 	INIT_LIST_HEAD(&dev->napi_list);
5067 
5068 	/* a dummy interface is started by default */
5069 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5070 	set_bit(__LINK_STATE_START, &dev->state);
5071 
5072 	return 0;
5073 }
5074 EXPORT_SYMBOL_GPL(init_dummy_netdev);
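
/*
 * Usage sketch (hypothetical): a driver whose hardware exposes several
 * channels behind one interrupt uses a dummy netdev purely as the NAPI
 * anchor.  "priv", "my_poll" and the weight of 64 are made up here.
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, my_poll, 64);
 *	napi_enable(&priv->napi);
 */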
5075 
5076 
5077 /**
5078  *	register_netdev	- register a network device
5079  *	@dev: device to register
5080  *
5081  *	Take a completed network device structure and add it to the kernel
5082  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5083  *	chain. 0 is returned on success. A negative errno code is returned
5084  *	on a failure to set up the device, or if the name is a duplicate.
5085  *
5086  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5087  *	and expands the device name if you passed a format string to
5088  *	alloc_netdev.
5089  */
5090 int register_netdev(struct net_device *dev)
5091 {
5092 	int err;
5093 
5094 	rtnl_lock();
5095 
5096 	/*
5097 	 * If the name is a format string the caller wants us to do a
5098 	 * name allocation.
5099 	 */
5100 	if (strchr(dev->name, '%')) {
5101 		err = dev_alloc_name(dev, dev->name);
5102 		if (err < 0)
5103 			goto out;
5104 	}
5105 
5106 	err = register_netdevice(dev);
5107 out:
5108 	rtnl_unlock();
5109 	return err;
5110 }
5111 EXPORT_SYMBOL(register_netdev);
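
/*
 * Usage sketch (hypothetical): the usual probe-time sequence for an
 * Ethernet driver.  alloc_etherdev() passes the name "eth%d", which
 * register_netdev() expands; "struct my_priv" and "my_netdev_ops" are
 * made-up driver names.
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */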
5112 
5113 /*
5114  * netdev_wait_allrefs - wait until all references are gone.
5115  *
5116  * This is called when unregistering network devices.
5117  *
5118  * Any protocol or device that holds a reference should register
5119  * for netdevice notification, and cleanup and put back the
5120  * reference if they receive an UNREGISTER event.
5121  * We can get stuck here if buggy protocols don't correctly
5122  * call dev_put.
5123  */
5124 static void netdev_wait_allrefs(struct net_device *dev)
5125 {
5126 	unsigned long rebroadcast_time, warning_time;
5127 
5128 	linkwatch_forget_dev(dev);
5129 
5130 	rebroadcast_time = warning_time = jiffies;
5131 	while (atomic_read(&dev->refcnt) != 0) {
5132 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5133 			rtnl_lock();
5134 
5135 			/* Rebroadcast unregister notification */
5136 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5137 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5138 			 * should have already handled it the first time */
5139 
5140 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5141 				     &dev->state)) {
5142 				/* We must not have linkwatch events
5143 				 * pending on unregister. If this
5144 				 * happens, we simply run the queue
5145 				 * unscheduled, resulting in a noop
5146 				 * for this device.
5147 				 */
5148 				linkwatch_run_queue();
5149 			}
5150 
5151 			__rtnl_unlock();
5152 
5153 			rebroadcast_time = jiffies;
5154 		}
5155 
5156 		msleep(250);
5157 
5158 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5159 			printk(KERN_EMERG "unregister_netdevice: "
5160 			       "waiting for %s to become free. Usage "
5161 			       "count = %d\n",
5162 			       dev->name, atomic_read(&dev->refcnt));
5163 			warning_time = jiffies;
5164 		}
5165 	}
5166 }
5167 
5168 /* The sequence is:
5169  *
5170  *	rtnl_lock();
5171  *	...
5172  *	register_netdevice(x1);
5173  *	register_netdevice(x2);
5174  *	...
5175  *	unregister_netdevice(y1);
5176  *	unregister_netdevice(y2);
5177  *      ...
5178  *	rtnl_unlock();
5179  *	free_netdev(y1);
5180  *	free_netdev(y2);
5181  *
5182  * We are invoked by rtnl_unlock().
5183  * This allows us to deal with problems:
5184  * 1) We can delete sysfs objects which invoke hotplug
5185  *    without deadlocking with linkwatch via keventd.
5186  * 2) Since we run with the RTNL semaphore not held, we can sleep
5187  *    safely in order to wait for the netdev refcnt to drop to zero.
5188  *
5189  * We must not return until all unregister events added during
5190  * the interval the lock was held have been completed.
5191  */
5192 void netdev_run_todo(void)
5193 {
5194 	struct list_head list;
5195 
5196 	/* Snapshot list, allow later requests */
5197 	list_replace_init(&net_todo_list, &list);
5198 
5199 	__rtnl_unlock();
5200 
5201 	while (!list_empty(&list)) {
5202 		struct net_device *dev
5203 			= list_first_entry(&list, struct net_device, todo_list);
5204 		list_del(&dev->todo_list);
5205 
5206 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5207 			printk(KERN_ERR "network todo '%s' but state %d\n",
5208 			       dev->name, dev->reg_state);
5209 			dump_stack();
5210 			continue;
5211 		}
5212 
5213 		dev->reg_state = NETREG_UNREGISTERED;
5214 
5215 		on_each_cpu(flush_backlog, dev, 1);
5216 
5217 		netdev_wait_allrefs(dev);
5218 
5219 		/* paranoia */
5220 		BUG_ON(atomic_read(&dev->refcnt));
5221 		WARN_ON(dev->ip_ptr);
5222 		WARN_ON(dev->ip6_ptr);
5223 		WARN_ON(dev->dn_ptr);
5224 
5225 		if (dev->destructor)
5226 			dev->destructor(dev);
5227 
5228 		/* Free network device */
5229 		kobject_put(&dev->dev.kobj);
5230 	}
5231 }
5232 
5233 /**
5234  *	dev_txq_stats_fold - fold tx_queues stats
5235  *	@dev: device to get statistics from
5236  *	@stats: struct net_device_stats to hold results
5237  */
5238 void dev_txq_stats_fold(const struct net_device *dev,
5239 			struct net_device_stats *stats)
5240 {
5241 	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5242 	unsigned int i;
5243 	struct netdev_queue *txq;
5244 
5245 	for (i = 0; i < dev->num_tx_queues; i++) {
5246 		txq = netdev_get_tx_queue(dev, i);
5247 		tx_bytes   += txq->tx_bytes;
5248 		tx_packets += txq->tx_packets;
5249 		tx_dropped += txq->tx_dropped;
5250 	}
5251 	if (tx_bytes || tx_packets || tx_dropped) {
5252 		stats->tx_bytes   = tx_bytes;
5253 		stats->tx_packets = tx_packets;
5254 		stats->tx_dropped = tx_dropped;
5255 	}
5256 }
5257 EXPORT_SYMBOL(dev_txq_stats_fold);
5258 
5259 /**
5260  *	dev_get_stats	- get network device statistics
5261  *	@dev: device to get statistics from
5262  *
5263  *	Get network statistics from device. The device driver may provide
5264  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
5265  *	the internal statistics structure is used.
5266  */
5267 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5268 {
5269 	const struct net_device_ops *ops = dev->netdev_ops;
5270 
5271 	if (ops->ndo_get_stats)
5272 		return ops->ndo_get_stats(dev);
5273 
5274 	dev_txq_stats_fold(dev, &dev->stats);
5275 	return &dev->stats;
5276 }
5277 EXPORT_SYMBOL(dev_get_stats);
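
/*
 * Usage sketch (hypothetical): dumping a device's packet counters from a
 * debug path.  The returned pointer refers to storage owned by the device
 * and must not be freed by the caller.
 *
 *	const struct net_device_stats *stats = dev_get_stats(dev);
 *
 *	printk(KERN_DEBUG "%s: rx %lu tx %lu\n",
 *	       dev->name, stats->rx_packets, stats->tx_packets);
 */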
5278 
5279 static void netdev_init_one_queue(struct net_device *dev,
5280 				  struct netdev_queue *queue,
5281 				  void *_unused)
5282 {
5283 	queue->dev = dev;
5284 }
5285 
5286 static void netdev_init_queues(struct net_device *dev)
5287 {
5288 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5289 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5290 	spin_lock_init(&dev->tx_global_lock);
5291 }
5292 
5293 /**
5294  *	alloc_netdev_mq - allocate network device
5295  *	@sizeof_priv:	size of private data to allocate space for
5296  *	@name:		device name format string
5297  *	@setup:		callback to initialize device
5298  *	@queue_count:	the number of subqueues to allocate
5299  *
5300  *	Allocates a struct net_device with private data area for driver use
5301  *	and performs basic initialization.  Also allocates subqueue structs
5302  *	for each queue on the device at the end of the netdevice.
5303  */
5304 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5305 		void (*setup)(struct net_device *), unsigned int queue_count)
5306 {
5307 	struct netdev_queue *tx;
5308 	struct net_device *dev;
5309 	size_t alloc_size;
5310 	struct net_device *p;
5311 #ifdef CONFIG_RPS
5312 	struct netdev_rx_queue *rx;
5313 	int i;
5314 #endif
5315 
5316 	BUG_ON(strlen(name) >= sizeof(dev->name));
5317 
5318 	alloc_size = sizeof(struct net_device);
5319 	if (sizeof_priv) {
5320 		/* ensure 32-byte alignment of private area */
5321 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5322 		alloc_size += sizeof_priv;
5323 	}
5324 	/* ensure 32-byte alignment of whole construct */
5325 	alloc_size += NETDEV_ALIGN - 1;
5326 
5327 	p = kzalloc(alloc_size, GFP_KERNEL);
5328 	if (!p) {
5329 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5330 		return NULL;
5331 	}
5332 
5333 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5334 	if (!tx) {
5335 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5336 		       "tx qdiscs.\n");
5337 		goto free_p;
5338 	}
5339 
5340 #ifdef CONFIG_RPS
5341 	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5342 	if (!rx) {
5343 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5344 		       "rx queues.\n");
5345 		goto free_tx;
5346 	}
5347 
5348 	atomic_set(&rx->count, queue_count);
5349 
5350 	/*
5351 	 * Set a pointer to first element in the array which holds the
5352 	 * reference count.
5353 	 */
5354 	for (i = 0; i < queue_count; i++)
5355 		rx[i].first = rx;
5356 #endif
5357 
5358 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5359 	dev->padded = (char *)dev - (char *)p;
5360 
5361 	if (dev_addr_init(dev))
5362 		goto free_rx;
5363 
5364 	dev_mc_init(dev);
5365 	dev_uc_init(dev);
5366 
5367 	dev_net_set(dev, &init_net);
5368 
5369 	dev->_tx = tx;
5370 	dev->num_tx_queues = queue_count;
5371 	dev->real_num_tx_queues = queue_count;
5372 
5373 #ifdef CONFIG_RPS
5374 	dev->_rx = rx;
5375 	dev->num_rx_queues = queue_count;
5376 #endif
5377 
5378 	dev->gso_max_size = GSO_MAX_SIZE;
5379 
5380 	netdev_init_queues(dev);
5381 
5382 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5383 	dev->ethtool_ntuple_list.count = 0;
5384 	INIT_LIST_HEAD(&dev->napi_list);
5385 	INIT_LIST_HEAD(&dev->unreg_list);
5386 	INIT_LIST_HEAD(&dev->link_watch_list);
5387 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5388 	setup(dev);
5389 	strcpy(dev->name, name);
5390 	return dev;
5391 
5392 free_rx:
5393 #ifdef CONFIG_RPS
5394 	kfree(rx);
5395 free_tx:
5396 #endif
5397 	kfree(tx);
5398 free_p:
5399 	kfree(p);
5400 	return NULL;
5401 }
5402 EXPORT_SYMBOL(alloc_netdev_mq);
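
/*
 * Usage sketch (hypothetical): a multiqueue driver allocating a device
 * with four TX/RX queues and a private area, and releasing it if later
 * setup fails.  "my_setup" and "struct my_priv" are made-up names.
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
 *			      my_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	...
 *	free_netdev(dev);
 */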
5403 
5404 /**
5405  *	free_netdev - free network device
5406  *	@dev: device
5407  *
5408  *	This function does the last stage of destroying an allocated device
5409  * 	interface. The reference to the device object is released.
5410  *	If this is the last reference then it will be freed.
5411  */
5412 void free_netdev(struct net_device *dev)
5413 {
5414 	struct napi_struct *p, *n;
5415 
5416 	release_net(dev_net(dev));
5417 
5418 	kfree(dev->_tx);
5419 
5420 	/* Flush device addresses */
5421 	dev_addr_flush(dev);
5422 
5423 	/* Clear ethtool n-tuple list */
5424 	ethtool_ntuple_flush(dev);
5425 
5426 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5427 		netif_napi_del(p);
5428 
5429 	/*  Compatibility with error handling in drivers */
5430 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5431 		kfree((char *)dev - dev->padded);
5432 		return;
5433 	}
5434 
5435 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5436 	dev->reg_state = NETREG_RELEASED;
5437 
5438 	/* will free via device release */
5439 	put_device(&dev->dev);
5440 }
5441 EXPORT_SYMBOL(free_netdev);
5442 
5443 /**
5444  *	synchronize_net -  Synchronize with packet receive processing
5445  *
5446  *	Wait for packets currently being received to be done.
5447  *	Does not block later packets from starting.
5448  */
5449 void synchronize_net(void)
5450 {
5451 	might_sleep();
5452 	synchronize_rcu();
5453 }
5454 EXPORT_SYMBOL(synchronize_net);
5455 
5456 /**
5457  *	unregister_netdevice_queue - remove device from the kernel
5458  *	@dev: device
5459  *	@head: list
5460  *
5461  *	This function shuts down a device interface and removes it
5462  *	from the kernel tables.
5463  *	If @head is not NULL, the device is queued to be unregistered later.
5464  *
5465  *	Callers must hold the rtnl semaphore.  You may want
5466  *	unregister_netdev() instead of this.
5467  */
5468 
5469 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5470 {
5471 	ASSERT_RTNL();
5472 
5473 	if (head) {
5474 		list_move_tail(&dev->unreg_list, head);
5475 	} else {
5476 		rollback_registered(dev);
5477 		/* Finish processing unregister after unlock */
5478 		net_set_todo(dev);
5479 	}
5480 }
5481 EXPORT_SYMBOL(unregister_netdevice_queue);
5482 
5483 /**
5484  *	unregister_netdevice_many - unregister many devices
5485  *	@head: list of devices
5486  */
5487 void unregister_netdevice_many(struct list_head *head)
5488 {
5489 	struct net_device *dev;
5490 
5491 	if (!list_empty(head)) {
5492 		rollback_registered_many(head);
5493 		list_for_each_entry(dev, head, unreg_list)
5494 			net_set_todo(dev);
5495 	}
5496 }
5497 EXPORT_SYMBOL(unregister_netdevice_many);
5498 
5499 /**
5500  *	unregister_netdev - remove device from the kernel
5501  *	@dev: device
5502  *
5503  *	This function shuts down a device interface and removes it
5504  *	from the kernel tables.
5505  *
5506  *	This is just a wrapper for unregister_netdevice that takes
5507  *	the rtnl semaphore.  In general you want to use this and not
5508  *	unregister_netdevice.
5509  */
5510 void unregister_netdev(struct net_device *dev)
5511 {
5512 	rtnl_lock();
5513 	unregister_netdevice(dev);
5514 	rtnl_unlock();
5515 }
5516 EXPORT_SYMBOL(unregister_netdev);
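
/*
 * Usage sketch (hypothetical): the usual driver removal path.  The device
 * may only be freed once the unregister has fully completed.
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */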
5517 
5518 /**
5519  *	dev_change_net_namespace - move device to a different network namespace
5520  *	@dev: device
5521  *	@net: network namespace
5522  *	@pat: If not NULL name pattern to try if the current device name
5523  *	      is already taken in the destination network namespace.
5524  *
5525  *	This function shuts down a device interface and moves it
5526  *	to a new network namespace. On success 0 is returned, on
5527  *	a failure a netagive errno code is returned.
5528  *	a failure a negative errno code is returned.
5529  *	Callers must hold the rtnl semaphore.
5530  */
5531 
5532 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5533 {
5534 	int err;
5535 
5536 	ASSERT_RTNL();
5537 
5538 	/* Don't allow namespace local devices to be moved. */
5539 	err = -EINVAL;
5540 	if (dev->features & NETIF_F_NETNS_LOCAL)
5541 		goto out;
5542 
5543 	/* Ensure the device has been registered */
5544 	err = -EINVAL;
5545 	if (dev->reg_state != NETREG_REGISTERED)
5546 		goto out;
5547 
5548 	/* Get out if there is nothing to do */
5549 	err = 0;
5550 	if (net_eq(dev_net(dev), net))
5551 		goto out;
5552 
5553 	/* Pick the destination device name, and ensure
5554 	 * we can use it in the destination network namespace.
5555 	 */
5556 	err = -EEXIST;
5557 	if (__dev_get_by_name(net, dev->name)) {
5558 		/* We get here if we can't use the current device name */
5559 		if (!pat)
5560 			goto out;
5561 		if (dev_get_valid_name(net, pat, dev->name, 1))
5562 			goto out;
5563 	}
5564 
5565 	/*
5566 	 * And now a mini version of register_netdevice and unregister_netdevice.
5567 	 */
5568 
5569 	/* If device is running close it first. */
5570 	dev_close(dev);
5571 
5572 	/* And unlink it from device chain */
5573 	err = -ENODEV;
5574 	unlist_netdevice(dev);
5575 
5576 	synchronize_net();
5577 
5578 	/* Shutdown queueing discipline. */
5579 	dev_shutdown(dev);
5580 
5581 	/* Notify protocols, that we are about to destroy
5582 	   this device. They should clean all the things.
5583 	*/
5584 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5585 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5586 
5587 	/*
5588 	 *	Flush the unicast and multicast chains
5589 	 */
5590 	dev_uc_flush(dev);
5591 	dev_mc_flush(dev);
5592 
5593 	/* Actually switch the network namespace */
5594 	dev_net_set(dev, net);
5595 
5596 	/* If there is an ifindex conflict assign a new one */
5597 	if (__dev_get_by_index(net, dev->ifindex)) {
5598 		int iflink = (dev->iflink == dev->ifindex);
5599 		dev->ifindex = dev_new_index(net);
5600 		if (iflink)
5601 			dev->iflink = dev->ifindex;
5602 	}
5603 
5604 	/* Fixup kobjects */
5605 	err = device_rename(&dev->dev, dev->name);
5606 	WARN_ON(err);
5607 
5608 	/* Add the device back in the hashes */
5609 	list_netdevice(dev);
5610 
5611 	/* Notify protocols, that a new device appeared. */
5612 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5613 
5614 	/*
5615 	 *	Prevent userspace races by waiting until the network
5616 	 *	device is fully setup before sending notifications.
5617 	 */
5618 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5619 
5620 	synchronize_net();
5621 	err = 0;
5622 out:
5623 	return err;
5624 }
5625 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5626 
5627 static int dev_cpu_callback(struct notifier_block *nfb,
5628 			    unsigned long action,
5629 			    void *ocpu)
5630 {
5631 	struct sk_buff **list_skb;
5632 	struct sk_buff *skb;
5633 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5634 	struct softnet_data *sd, *oldsd;
5635 
5636 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5637 		return NOTIFY_OK;
5638 
5639 	local_irq_disable();
5640 	cpu = smp_processor_id();
5641 	sd = &per_cpu(softnet_data, cpu);
5642 	oldsd = &per_cpu(softnet_data, oldcpu);
5643 
5644 	/* Find end of our completion_queue. */
5645 	list_skb = &sd->completion_queue;
5646 	while (*list_skb)
5647 		list_skb = &(*list_skb)->next;
5648 	/* Append completion queue from offline CPU. */
5649 	*list_skb = oldsd->completion_queue;
5650 	oldsd->completion_queue = NULL;
5651 
5652 	/* Append output queue from offline CPU. */
5653 	if (oldsd->output_queue) {
5654 		*sd->output_queue_tailp = oldsd->output_queue;
5655 		sd->output_queue_tailp = oldsd->output_queue_tailp;
5656 		oldsd->output_queue = NULL;
5657 		oldsd->output_queue_tailp = &oldsd->output_queue;
5658 	}
5659 
5660 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5661 	local_irq_enable();
5662 
5663 	/* Process offline CPU's input_pkt_queue */
5664 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5665 		netif_rx(skb);
5666 		input_queue_head_add(oldsd, 1);
5667 	}
5668 	while ((skb = __skb_dequeue(&oldsd->process_queue)))
5669 		netif_rx(skb);
5670 
5671 	return NOTIFY_OK;
5672 }
5673 
5674 
5675 /**
5676  *	netdev_increment_features - increment feature set by one
5677  *	@all: current feature set
5678  *	@one: new feature set
5679  *	@mask: mask feature set
5680  *
5681  *	Computes a new feature set after adding a device with feature set
5682  *	@one to the master device with current feature set @all.  Will not
5683  *	enable anything that is off in @mask. Returns the new feature set.
5684  */
5685 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5686 					unsigned long mask)
5687 {
5688 	/* If device needs checksumming, downgrade to it. */
5689 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5690 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5691 	else if (mask & NETIF_F_ALL_CSUM) {
5692 		/* If one device supports v4/v6 checksumming, set for all. */
5693 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5694 		    !(all & NETIF_F_GEN_CSUM)) {
5695 			all &= ~NETIF_F_ALL_CSUM;
5696 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5697 		}
5698 
5699 		/* If one device supports hw checksumming, set for all. */
5700 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5701 			all &= ~NETIF_F_ALL_CSUM;
5702 			all |= NETIF_F_HW_CSUM;
5703 		}
5704 	}
5705 
5706 	one |= NETIF_F_ALL_CSUM;
5707 
5708 	one |= all & NETIF_F_ONE_FOR_ALL;
5709 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5710 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5711 
5712 	return all;
5713 }
5714 EXPORT_SYMBOL(netdev_increment_features);
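
/*
 * Usage sketch (hypothetical): a bonding-style master recomputing its
 * feature set after its slave list changes.  "slaves", "slave" and "mask"
 * are made-up names standing in for the master's own bookkeeping.
 *
 *	unsigned long features = NETIF_F_ONE_FOR_ALL;
 *
 *	list_for_each_entry(slave, &slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	master->features = features;
 */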
5715 
5716 static struct hlist_head *netdev_create_hash(void)
5717 {
5718 	int i;
5719 	struct hlist_head *hash;
5720 
5721 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5722 	if (hash != NULL)
5723 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5724 			INIT_HLIST_HEAD(&hash[i]);
5725 
5726 	return hash;
5727 }
5728 
5729 /* Initialize per network namespace state */
5730 static int __net_init netdev_init(struct net *net)
5731 {
5732 	INIT_LIST_HEAD(&net->dev_base_head);
5733 
5734 	net->dev_name_head = netdev_create_hash();
5735 	if (net->dev_name_head == NULL)
5736 		goto err_name;
5737 
5738 	net->dev_index_head = netdev_create_hash();
5739 	if (net->dev_index_head == NULL)
5740 		goto err_idx;
5741 
5742 	return 0;
5743 
5744 err_idx:
5745 	kfree(net->dev_name_head);
5746 err_name:
5747 	return -ENOMEM;
5748 }
5749 
5750 /**
5751  *	netdev_drivername - network driver for the device
5752  *	@dev: network device
5753  *	@buffer: buffer for resulting name
5754  *	@len: size of buffer
5755  *
5756  *	Determine network driver for device.
5757  */
5758 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5759 {
5760 	const struct device_driver *driver;
5761 	const struct device *parent;
5762 
5763 	if (len <= 0 || !buffer)
5764 		return buffer;
5765 	buffer[0] = 0;
5766 
5767 	parent = dev->dev.parent;
5768 
5769 	if (!parent)
5770 		return buffer;
5771 
5772 	driver = parent->driver;
5773 	if (driver && driver->name)
5774 		strlcpy(buffer, driver->name, len);
5775 	return buffer;
5776 }
5777 
5778 static void __net_exit netdev_exit(struct net *net)
5779 {
5780 	kfree(net->dev_name_head);
5781 	kfree(net->dev_index_head);
5782 }
5783 
5784 static struct pernet_operations __net_initdata netdev_net_ops = {
5785 	.init = netdev_init,
5786 	.exit = netdev_exit,
5787 };
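
/*
 * Editor's note -- hedged sketch, not part of the original dev.c: the
 * init/exit pairing in netdev_net_ops above is the general pattern for
 * per-network-namespace state.  Everything named "example" below is
 * hypothetical; a real subsystem registers its ops once, typically from its
 * own initcall, with register_pernet_subsys(&example_net_ops).
 */
#if 0
static int __net_init example_net_init(struct net *net)
{
	/* Allocate and initialise whatever per-namespace state is needed. */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* Undo everything example_net_init() set up for this namespace. */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};
#endif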
5788 
5789 static void __net_exit default_device_exit(struct net *net)
5790 {
5791 	struct net_device *dev, *aux;
5792 	/*
5793 	 * Push all migratable network devices back to the
5794 	 * initial network namespace
5795 	 */
5796 	rtnl_lock();
5797 	for_each_netdev_safe(net, dev, aux) {
5798 		int err;
5799 		char fb_name[IFNAMSIZ];
5800 
5801 		/* Ignore unmovable devices (e.g. loopback) */
5802 		if (dev->features & NETIF_F_NETNS_LOCAL)
5803 			continue;
5804 
5805 		/* Leave virtual devices for the generic cleanup */
5806 		if (dev->rtnl_link_ops)
5807 			continue;
5808 
5809 		/* Push remaining network devices to init_net */
5810 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5811 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5812 		if (err) {
5813 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5814 				__func__, dev->name, err);
5815 			BUG();
5816 		}
5817 	}
5818 	rtnl_unlock();
5819 }
5820 
5821 static void __net_exit default_device_exit_batch(struct list_head *net_list)
5822 {
5823 	/* At exit, all network devices must be removed from their network
5824 	 * namespace.  Do this in the reverse order of registration.
5825 	 * Do this across as many network namespaces as possible to
5826 	 * improve batching efficiency.
5827 	 */
5828 	struct net_device *dev;
5829 	struct net *net;
5830 	LIST_HEAD(dev_kill_list);
5831 
5832 	rtnl_lock();
5833 	list_for_each_entry(net, net_list, exit_list) {
5834 		for_each_netdev_reverse(net, dev) {
5835 			if (dev->rtnl_link_ops)
5836 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5837 			else
5838 				unregister_netdevice_queue(dev, &dev_kill_list);
5839 		}
5840 	}
5841 	unregister_netdevice_many(&dev_kill_list);
5842 	rtnl_unlock();
5843 }
5844 
5845 static struct pernet_operations __net_initdata default_device_ops = {
5846 	.exit = default_device_exit,
5847 	.exit_batch = default_device_exit_batch,
5848 };
5849 
5850 /*
5851  *	Initialize the DEV module. At boot time this walks the device list and
5852  *	unhooks any devices that fail to initialise (normally hardware not
5853  *	present) and leaves us with a valid list of present and active devices.
5854  *
5855  */
5856 
5857 /*
5858  *       This is called single-threaded during boot, so there is no need
5859  *       to take the rtnl semaphore.
5860  */
5861 static int __init net_dev_init(void)
5862 {
5863 	int i, rc = -ENOMEM;
5864 
5865 	BUG_ON(!dev_boot_phase);
5866 
5867 	if (dev_proc_init())
5868 		goto out;
5869 
5870 	if (netdev_kobject_init())
5871 		goto out;
5872 
5873 	INIT_LIST_HEAD(&ptype_all);
5874 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5875 		INIT_LIST_HEAD(&ptype_base[i]);
5876 
5877 	if (register_pernet_subsys(&netdev_net_ops))
5878 		goto out;
5879 
5880 	/*
5881 	 *	Initialise the packet receive queues.
5882 	 */
5883 
5884 	for_each_possible_cpu(i) {
5885 		struct softnet_data *sd = &per_cpu(softnet_data, i);
5886 
5887 		memset(sd, 0, sizeof(*sd));
5888 		skb_queue_head_init(&sd->input_pkt_queue);
5889 		skb_queue_head_init(&sd->process_queue);
5890 		sd->completion_queue = NULL;
5891 		INIT_LIST_HEAD(&sd->poll_list);
5892 		sd->output_queue = NULL;
5893 		sd->output_queue_tailp = &sd->output_queue;
5894 #ifdef CONFIG_RPS
5895 		sd->csd.func = rps_trigger_softirq;
5896 		sd->csd.info = sd;
5897 		sd->csd.flags = 0;
5898 		sd->cpu = i;
5899 #endif
5900 
5901 		sd->backlog.poll = process_backlog;
5902 		sd->backlog.weight = weight_p;
5903 		sd->backlog.gro_list = NULL;
5904 		sd->backlog.gro_count = 0;
5905 	}
5906 
5907 	dev_boot_phase = 0;
5908 
5909 	/* The loopback device is special: if any other network device
5910 	 * is present in a network namespace, the loopback device must
5911 	 * be present too.  Since we now dynamically allocate and free
5912 	 * the loopback device, ensure this invariant is maintained by
5913 	 * keeping the loopback device as the first device on the
5914 	 * list of network devices.  This makes the loopback device
5915 	 * the first device that appears and the last network device
5916 	 * that disappears.
5917 	 */
5918 	if (register_pernet_device(&loopback_net_ops))
5919 		goto out;
5920 
5921 	if (register_pernet_device(&default_device_ops))
5922 		goto out;
5923 
5924 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5925 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5926 
5927 	hotcpu_notifier(dev_cpu_callback, 0);
5928 	dst_init();
5929 	dev_mcast_init();
5930 	rc = 0;
5931 out:
5932 	return rc;
5933 }
5934 
5935 subsys_initcall(net_dev_init);
5936 
5937 static int __init initialize_hashrnd(void)
5938 {
5939 	get_random_bytes(&hashrnd, sizeof(hashrnd));
5940 	return 0;
5941 }
5942 
5943 late_initcall_sync(initialize_hashrnd);
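
/*
 * Editor's note -- hedged sketch, not part of the original dev.c: hashrnd is
 * a per-boot random seed used by the packet-hashing helpers earlier in this
 * file; seeding it from an initcall keeps flow-to-queue placement
 * unpredictable across boots.  The helper below only illustrates the
 * seed-then-hash pattern and is hypothetical (jhash_1word() is from
 * <linux/jhash.h>).
 */
#if 0
static u32 example_flow_bucket(u32 flow_key, u32 nbuckets)
{
	/* Mix the per-boot seed in so bucket placement differs per boot. */
	return jhash_1word(flow_key, hashrnd) % nbuckets;
}
#endif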
5944 
5945