xref: /linux/net/core/dev.c (revision 12871a0bd67dd4db4418e1daafcd46e9d329ef10)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16. Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * See, for example usages, register_netdevice() and
196  * unregister_netdevice(), which must be called with the rtnl
197  * semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
201 
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207 
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212 
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 	spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219 
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 	spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226 
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 	struct net *net = dev_net(dev);
231 
232 	ASSERT_RTNL();
233 
234 	write_lock_bh(&dev_base_lock);
235 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 	hlist_add_head_rcu(&dev->index_hlist,
238 			   dev_index_hash(net, dev->ifindex));
239 	write_unlock_bh(&dev_base_lock);
240 	return 0;
241 }
242 
243 /* Device list removal
244  * caller must respect a RCU grace period before freeing/reusing dev
245  */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 	ASSERT_RTNL();
249 
250 	/* Unlink dev from the device chain */
251 	write_lock_bh(&dev_base_lock);
252 	list_del_rcu(&dev->dev_list);
253 	hlist_del_rcu(&dev->name_hlist);
254 	hlist_del_rcu(&dev->index_hlist);
255 	write_unlock_bh(&dev_base_lock);
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 	 ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 	 "_xmit_VOID", "_xmit_NONE"};
312 
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315 
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 	int i;
319 
320 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 		if (netdev_lock_type[i] == dev_type)
322 			return i;
323 	/* the last key is used by default */
324 	return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326 
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 	int i;
331 
332 	i = netdev_lock_pos(dev_type);
333 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 				   netdev_lock_name[i]);
335 }
336 
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 	int i;
340 
341 	i = netdev_lock_pos(dev->type);
342 	lockdep_set_class_and_name(&dev->addr_list_lock,
343 				   &netdev_addr_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 						 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355 
356 /*******************************************************************************
357 
358 		Protocol management and registration routines
359 
360 *******************************************************************************/
361 
362 /*
363  *	Add a protocol ID to the list. Now that the input handler is
364  *	smarter we can dispense with all the messy stuff that used to be
365  *	here.
366  *
367  *	BEWARE!!! Protocol handlers, mangling input packets,
368  *	MUST BE last in hash buckets and checking protocol handlers
369  *	MUST start from promiscuous ptype_all chain in net_bh.
370  *	It is true now, do not change it.
371  *	Explanation follows: if protocol handler, mangling packet, will
372  *	be the first on list, it is not able to sense, that packet
373  *	is cloned and should be copied-on-write, so that it will
374  *	change it and subsequent readers will get broken packet.
375  *							--ANK (980803)
376  */
377 
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 	if (pt->type == htons(ETH_P_ALL))
381 		return &ptype_all;
382 	else
383 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep therefore it can not
395  *	guarantee all CPU's that are in middle of receiving packets
396  *	will see the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408 
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *      The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPU's have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
461 
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine to
478  *	all netdevices.
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  * 	Check boot time settings for the device.
503  *	The found settings are set for the device to be used
504  *	later in the device probing.
505  *	Returns 0 if no settings found, 1 if they are.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  * 	Check boot time settings for the base address of device.
533  *	The found settings are set for the device to be used
534  *	later in the device probing.
535  *	Returns 0 if no settings found.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves at boot time configured settings for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
586 
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
669 
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns %NULL if the device
676  *	is not found or a pointer to the device. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns %NULL if the device
702  *	is not found or a pointer to the device. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns NULL if the device
728  *	is not found or a pointer to the device. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745 
746 /**
747  *	dev_getbyhwaddr_rcu - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns NULL if the device
753  *	is not found or a pointer to the device.
754  *	The caller must hold RCU or RTNL.
755  *	The returned device has not had its ref count increased
756  *	and the caller must therefore be careful about locking
757  *
758  */
759 
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 				       const char *ha)
762 {
763 	struct net_device *dev;
764 
765 	for_each_netdev_rcu(net, dev)
766 		if (dev->type == type &&
767 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
768 			return dev;
769 
770 	return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 	struct net_device *dev;
777 
778 	ASSERT_RTNL();
779 	for_each_netdev(net, dev)
780 		if (dev->type == type)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786 
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 	struct net_device *dev, *ret = NULL;
790 
791 	rcu_read_lock();
792 	for_each_netdev_rcu(net, dev)
793 		if (dev->type == type) {
794 			dev_hold(dev);
795 			ret = dev;
796 			break;
797 		}
798 	rcu_read_unlock();
799 	return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802 
803 /**
804  *	dev_get_by_flags_rcu - find any device with given flags
805  *	@net: the applicable net namespace
806  *	@if_flags: IFF_* values
807  *	@mask: bitmask of bits in if_flags to check
808  *
809  *	Search for any interface with the given flags. Returns NULL if a device
810  *	is not found or a pointer to the device. Must be called inside
811  *	rcu_read_lock(), and result refcount is unchanged.
812  */
813 
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 				    unsigned short mask)
816 {
817 	struct net_device *dev, *ret;
818 
819 	ret = NULL;
820 	for_each_netdev_rcu(net, dev) {
821 		if (((dev->flags ^ if_flags) & mask) == 0) {
822 			ret = dev;
823 			break;
824 		}
825 	}
826 	return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829 
830 /**
831  *	dev_valid_name - check if name is okay for network device
832  *	@name: name string
833  *
834  *	Network device names need to be valid file names to
835  *	to allow sysfs to work.  We also disallow any kind of
836  *	whitespace.
837  */
838 int dev_valid_name(const char *name)
839 {
840 	if (*name == '\0')
841 		return 0;
842 	if (strlen(name) >= IFNAMSIZ)
843 		return 0;
844 	if (!strcmp(name, ".") || !strcmp(name, ".."))
845 		return 0;
846 
847 	while (*name) {
848 		if (*name == '/' || isspace(*name))
849 			return 0;
850 		name++;
851 	}
852 	return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855 
856 /**
857  *	__dev_alloc_name - allocate a name for a device
858  *	@net: network namespace to allocate the device name in
859  *	@name: name format string
860  *	@buf:  scratch buffer and result name string
861  *
862  *	Passed a format string - eg "lt%d" it will try and find a suitable
863  *	id. It scans list of devices to build up a free map, then chooses
864  *	the first empty slot. The caller must hold the dev_base or rtnl lock
865  *	while allocating the name and adding the device in order to avoid
866  *	duplicates.
867  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868  *	Returns the number of the unit assigned or a negative errno code.
869  */
870 
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 	int i = 0;
874 	const char *p;
875 	const int max_netdevices = 8*PAGE_SIZE;
876 	unsigned long *inuse;
877 	struct net_device *d;
878 
879 	p = strnchr(name, IFNAMSIZ-1, '%');
880 	if (p) {
881 		/*
882 		 * Verify the string as this thing may have come from
883 		 * the user.  There must be either one "%d" and no other "%"
884 		 * characters.
885 		 */
886 		if (p[1] != 'd' || strchr(p + 2, '%'))
887 			return -EINVAL;
888 
889 		/* Use one page as a bit array of possible slots */
890 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 		if (!inuse)
892 			return -ENOMEM;
893 
894 		for_each_netdev(net, d) {
895 			if (!sscanf(d->name, name, &i))
896 				continue;
897 			if (i < 0 || i >= max_netdevices)
898 				continue;
899 
900 			/*  avoid cases where sscanf is not exact inverse of printf */
901 			snprintf(buf, IFNAMSIZ, name, i);
902 			if (!strncmp(buf, d->name, IFNAMSIZ))
903 				set_bit(i, inuse);
904 		}
905 
906 		i = find_first_zero_bit(inuse, max_netdevices);
907 		free_page((unsigned long) inuse);
908 	}
909 
910 	if (buf != name)
911 		snprintf(buf, IFNAMSIZ, name, i);
912 	if (!__dev_get_by_name(net, buf))
913 		return i;
914 
915 	/* It is possible to run out of possible slots
916 	 * when the name is long and there isn't enough space left
917 	 * for the digits, or if all bits are used.
918 	 */
919 	return -ENFILE;
920 }
921 
922 /**
923  *	dev_alloc_name - allocate a name for a device
924  *	@dev: device
925  *	@name: name format string
926  *
927  *	Passed a format string - eg "lt%d" it will try and find a suitable
928  *	id. It scans list of devices to build up a free map, then chooses
929  *	the first empty slot. The caller must hold the dev_base or rtnl lock
930  *	while allocating the name and adding the device in order to avoid
931  *	duplicates.
932  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933  *	Returns the number of the unit assigned or a negative errno code.
934  */
935 
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 	char buf[IFNAMSIZ];
939 	struct net *net;
940 	int ret;
941 
942 	BUG_ON(!dev_net(dev));
943 	net = dev_net(dev);
944 	ret = __dev_alloc_name(net, name, buf);
945 	if (ret >= 0)
946 		strlcpy(dev->name, buf, IFNAMSIZ);
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
950 
951 static int dev_get_valid_name(struct net_device *dev, const char *name)
952 {
953 	struct net *net;
954 
955 	BUG_ON(!dev_net(dev));
956 	net = dev_net(dev);
957 
958 	if (!dev_valid_name(name))
959 		return -EINVAL;
960 
961 	if (strchr(name, '%'))
962 		return dev_alloc_name(dev, name);
963 	else if (__dev_get_by_name(net, name))
964 		return -EEXIST;
965 	else if (dev->name != name)
966 		strlcpy(dev->name, name, IFNAMSIZ);
967 
968 	return 0;
969 }
970 
971 /**
972  *	dev_change_name - change name of a device
973  *	@dev: device
974  *	@newname: name (or format string) must be at least IFNAMSIZ
975  *
976  *	Change name of a device, can pass format strings "eth%d".
977  *	for wildcarding.
978  */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 	char oldname[IFNAMSIZ];
982 	int err = 0;
983 	int ret;
984 	struct net *net;
985 
986 	ASSERT_RTNL();
987 	BUG_ON(!dev_net(dev));
988 
989 	net = dev_net(dev);
990 	if (dev->flags & IFF_UP)
991 		return -EBUSY;
992 
993 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 		return 0;
995 
996 	memcpy(oldname, dev->name, IFNAMSIZ);
997 
998 	err = dev_get_valid_name(dev, newname);
999 	if (err < 0)
1000 		return err;
1001 
1002 rollback:
1003 	ret = device_rename(&dev->dev, dev->name);
1004 	if (ret) {
1005 		memcpy(dev->name, oldname, IFNAMSIZ);
1006 		return ret;
1007 	}
1008 
1009 	write_lock_bh(&dev_base_lock);
1010 	hlist_del_rcu(&dev->name_hlist);
1011 	write_unlock_bh(&dev_base_lock);
1012 
1013 	synchronize_rcu();
1014 
1015 	write_lock_bh(&dev_base_lock);
1016 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 	write_unlock_bh(&dev_base_lock);
1018 
1019 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 	ret = notifier_to_errno(ret);
1021 
1022 	if (ret) {
1023 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1024 		if (err >= 0) {
1025 			err = ret;
1026 			memcpy(dev->name, oldname, IFNAMSIZ);
1027 			goto rollback;
1028 		} else {
1029 			printk(KERN_ERR
1030 			       "%s: name change rollback failed: %d.\n",
1031 			       dev->name, ret);
1032 		}
1033 	}
1034 
1035 	return err;
1036 }
1037 
1038 /**
1039  *	dev_set_alias - change ifalias of a device
1040  *	@dev: device
1041  *	@alias: name up to IFALIASZ
1042  *	@len: limit of bytes to copy from info
1043  *
1044  *	Set ifalias for a device,
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 	ASSERT_RTNL();
1049 
1050 	if (len >= IFALIASZ)
1051 		return -EINVAL;
1052 
1053 	if (!len) {
1054 		if (dev->ifalias) {
1055 			kfree(dev->ifalias);
1056 			dev->ifalias = NULL;
1057 		}
1058 		return 0;
1059 	}
1060 
1061 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 	if (!dev->ifalias)
1063 		return -ENOMEM;
1064 
1065 	strlcpy(dev->ifalias, alias, len+1);
1066 	return len;
1067 }
1068 
1069 
1070 /**
1071  *	netdev_features_change - device changes features
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081 
1082 /**
1083  *	netdev_state_change - device changes state
1084  *	@dev: device to cause notification
1085  *
1086  *	Called to indicate a device has changed state. This function calls
1087  *	the notifier chains for netdev_chain and sends a NEWLINK message
1088  *	to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 	if (dev->flags & IFF_UP) {
1093 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 	}
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098 
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 	return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104 
1105 /**
1106  *	dev_load 	- load a network module
1107  *	@net: the applicable net namespace
1108  *	@name: name of interface
1109  *
1110  *	If a network interface is not present and the process has suitable
1111  *	privileges this function loads the module. If module loading is not
1112  *	available in this kernel then it becomes a nop.
1113  */
1114 
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 	struct net_device *dev;
1118 	int no_module;
1119 
1120 	rcu_read_lock();
1121 	dev = dev_get_by_name_rcu(net, name);
1122 	rcu_read_unlock();
1123 
1124 	no_module = !dev;
1125 	if (no_module && capable(CAP_NET_ADMIN))
1126 		no_module = request_module("netdev-%s", name);
1127 	if (no_module && capable(CAP_SYS_MODULE)) {
1128 		if (!request_module("%s", name))
1129 			pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132 	}
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135 
1136 static int __dev_open(struct net_device *dev)
1137 {
1138 	const struct net_device_ops *ops = dev->netdev_ops;
1139 	int ret;
1140 
1141 	ASSERT_RTNL();
1142 
1143 	if (!netif_device_present(dev))
1144 		return -ENODEV;
1145 
1146 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147 	ret = notifier_to_errno(ret);
1148 	if (ret)
1149 		return ret;
1150 
1151 	set_bit(__LINK_STATE_START, &dev->state);
1152 
1153 	if (ops->ndo_validate_addr)
1154 		ret = ops->ndo_validate_addr(dev);
1155 
1156 	if (!ret && ops->ndo_open)
1157 		ret = ops->ndo_open(dev);
1158 
1159 	if (ret)
1160 		clear_bit(__LINK_STATE_START, &dev->state);
1161 	else {
1162 		dev->flags |= IFF_UP;
1163 		net_dmaengine_get();
1164 		dev_set_rx_mode(dev);
1165 		dev_activate(dev);
1166 	}
1167 
1168 	return ret;
1169 }
1170 
1171 /**
1172  *	dev_open	- prepare an interface for use.
1173  *	@dev:	device to open
1174  *
1175  *	Takes a device from down to up state. The device's private open
1176  *	function is invoked and then the multicast lists are loaded. Finally
1177  *	the device is moved into the up state and a %NETDEV_UP message is
1178  *	sent to the netdev notifier chain.
1179  *
1180  *	Calling this function on an active interface is a nop. On a failure
1181  *	a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185 	int ret;
1186 
1187 	if (dev->flags & IFF_UP)
1188 		return 0;
1189 
1190 	ret = __dev_open(dev);
1191 	if (ret < 0)
1192 		return ret;
1193 
1194 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195 	call_netdevice_notifiers(NETDEV_UP, dev);
1196 
1197 	return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200 
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203 	struct net_device *dev;
1204 
1205 	ASSERT_RTNL();
1206 	might_sleep();
1207 
1208 	list_for_each_entry(dev, head, unreg_list) {
1209 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210 
1211 		clear_bit(__LINK_STATE_START, &dev->state);
1212 
1213 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1214 		 * can be even on different cpu. So just clear netif_running().
1215 		 *
1216 		 * dev->stop() will invoke napi_disable() on all of it's
1217 		 * napi_struct instances on this device.
1218 		 */
1219 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 	}
1221 
1222 	dev_deactivate_many(head);
1223 
1224 	list_for_each_entry(dev, head, unreg_list) {
1225 		const struct net_device_ops *ops = dev->netdev_ops;
1226 
1227 		/*
1228 		 *	Call the device specific close. This cannot fail.
1229 		 *	Only if device is UP
1230 		 *
1231 		 *	We allow it to be called even after a DETACH hot-plug
1232 		 *	event.
1233 		 */
1234 		if (ops->ndo_stop)
1235 			ops->ndo_stop(dev);
1236 
1237 		dev->flags &= ~IFF_UP;
1238 		net_dmaengine_put();
1239 	}
1240 
1241 	return 0;
1242 }
1243 
1244 static int __dev_close(struct net_device *dev)
1245 {
1246 	int retval;
1247 	LIST_HEAD(single);
1248 
1249 	list_add(&dev->unreg_list, &single);
1250 	retval = __dev_close_many(&single);
1251 	list_del(&single);
1252 	return retval;
1253 }
1254 
1255 static int dev_close_many(struct list_head *head)
1256 {
1257 	struct net_device *dev, *tmp;
1258 	LIST_HEAD(tmp_list);
1259 
1260 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 		if (!(dev->flags & IFF_UP))
1262 			list_move(&dev->unreg_list, &tmp_list);
1263 
1264 	__dev_close_many(head);
1265 
1266 	list_for_each_entry(dev, head, unreg_list) {
1267 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 	}
1270 
1271 	/* rollback_registered_many needs the complete original list */
1272 	list_splice(&tmp_list, head);
1273 	return 0;
1274 }
1275 
1276 /**
1277  *	dev_close - shutdown an interface.
1278  *	@dev: device to shutdown
1279  *
1280  *	This function moves an active device into down state. A
1281  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283  *	chain.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287 	if (dev->flags & IFF_UP) {
1288 		LIST_HEAD(single);
1289 
1290 		list_add(&dev->unreg_list, &single);
1291 		dev_close_many(&single);
1292 		list_del(&single);
1293 	}
1294 	return 0;
1295 }
1296 EXPORT_SYMBOL(dev_close);
1297 
1298 
1299 /**
1300  *	dev_disable_lro - disable Large Receive Offload on a device
1301  *	@dev: device
1302  *
1303  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1304  *	called under RTNL.  This is needed if received packets may be
1305  *	forwarded to another interface.
1306  */
1307 void dev_disable_lro(struct net_device *dev)
1308 {
1309 	u32 flags;
1310 
1311 	/*
1312 	 * If we're trying to disable lro on a vlan device
1313 	 * use the underlying physical device instead
1314 	 */
1315 	if (is_vlan_dev(dev))
1316 		dev = vlan_dev_real_dev(dev);
1317 
1318 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1319 		flags = dev->ethtool_ops->get_flags(dev);
1320 	else
1321 		flags = ethtool_op_get_flags(dev);
1322 
1323 	if (!(flags & ETH_FLAG_LRO))
1324 		return;
1325 
1326 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1327 	if (unlikely(dev->features & NETIF_F_LRO))
1328 		netdev_WARN(dev, "failed to disable LRO!\n");
1329 }
1330 EXPORT_SYMBOL(dev_disable_lro);
1331 
1332 
1333 static int dev_boot_phase = 1;
1334 
1335 /**
1336  *	register_netdevice_notifier - register a network notifier block
1337  *	@nb: notifier
1338  *
1339  *	Register a notifier to be called when network device events occur.
1340  *	The notifier passed is linked into the kernel structures and must
1341  *	not be reused until it has been unregistered. A negative errno code
1342  *	is returned on a failure.
1343  *
1344  * 	When registered all registration and up events are replayed
1345  *	to the new notifier to allow device to have a race free
1346  *	view of the network device list.
1347  */
1348 
1349 int register_netdevice_notifier(struct notifier_block *nb)
1350 {
1351 	struct net_device *dev;
1352 	struct net_device *last;
1353 	struct net *net;
1354 	int err;
1355 
1356 	rtnl_lock();
1357 	err = raw_notifier_chain_register(&netdev_chain, nb);
1358 	if (err)
1359 		goto unlock;
1360 	if (dev_boot_phase)
1361 		goto unlock;
1362 	for_each_net(net) {
1363 		for_each_netdev(net, dev) {
1364 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1365 			err = notifier_to_errno(err);
1366 			if (err)
1367 				goto rollback;
1368 
1369 			if (!(dev->flags & IFF_UP))
1370 				continue;
1371 
1372 			nb->notifier_call(nb, NETDEV_UP, dev);
1373 		}
1374 	}
1375 
1376 unlock:
1377 	rtnl_unlock();
1378 	return err;
1379 
1380 rollback:
1381 	last = dev;
1382 	for_each_net(net) {
1383 		for_each_netdev(net, dev) {
1384 			if (dev == last)
1385 				break;
1386 
1387 			if (dev->flags & IFF_UP) {
1388 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1389 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1390 			}
1391 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1392 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1393 		}
1394 	}
1395 
1396 	raw_notifier_chain_unregister(&netdev_chain, nb);
1397 	goto unlock;
1398 }
1399 EXPORT_SYMBOL(register_netdevice_notifier);
1400 
1401 /**
1402  *	unregister_netdevice_notifier - unregister a network notifier block
1403  *	@nb: notifier
1404  *
1405  *	Unregister a notifier previously registered by
1406  *	register_netdevice_notifier(). The notifier is unlinked into the
1407  *	kernel structures and may then be reused. A negative errno code
1408  *	is returned on a failure.
1409  */
1410 
1411 int unregister_netdevice_notifier(struct notifier_block *nb)
1412 {
1413 	int err;
1414 
1415 	rtnl_lock();
1416 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1417 	rtnl_unlock();
1418 	return err;
1419 }
1420 EXPORT_SYMBOL(unregister_netdevice_notifier);
1421 
1422 /**
1423  *	call_netdevice_notifiers - call all network notifier blocks
1424  *      @val: value passed unmodified to notifier function
1425  *      @dev: net_device pointer passed unmodified to notifier function
1426  *
1427  *	Call all network notifier blocks.  Parameters and return value
1428  *	are as for raw_notifier_call_chain().
1429  */
1430 
1431 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1432 {
1433 	ASSERT_RTNL();
1434 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1435 }
1436 EXPORT_SYMBOL(call_netdevice_notifiers);
1437 
1438 /* When > 0 there are consumers of rx skb time stamps */
1439 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1440 
1441 void net_enable_timestamp(void)
1442 {
1443 	atomic_inc(&netstamp_needed);
1444 }
1445 EXPORT_SYMBOL(net_enable_timestamp);
1446 
1447 void net_disable_timestamp(void)
1448 {
1449 	atomic_dec(&netstamp_needed);
1450 }
1451 EXPORT_SYMBOL(net_disable_timestamp);
1452 
1453 static inline void net_timestamp_set(struct sk_buff *skb)
1454 {
1455 	if (atomic_read(&netstamp_needed))
1456 		__net_timestamp(skb);
1457 	else
1458 		skb->tstamp.tv64 = 0;
1459 }
1460 
1461 static inline void net_timestamp_check(struct sk_buff *skb)
1462 {
1463 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1464 		__net_timestamp(skb);
1465 }
1466 
1467 static inline bool is_skb_forwardable(struct net_device *dev,
1468 				      struct sk_buff *skb)
1469 {
1470 	unsigned int len;
1471 
1472 	if (!(dev->flags & IFF_UP))
1473 		return false;
1474 
1475 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1476 	if (skb->len <= len)
1477 		return true;
1478 
1479 	/* if TSO is enabled, we don't care about the length as the packet
1480 	 * could be forwarded without being segmented before
1481 	 */
1482 	if (skb_is_gso(skb))
1483 		return true;
1484 
1485 	return false;
1486 }
1487 
1488 /**
1489  * dev_forward_skb - loopback an skb to another netif
1490  *
1491  * @dev: destination network device
1492  * @skb: buffer to forward
1493  *
1494  * return values:
1495  *	NET_RX_SUCCESS	(no congestion)
1496  *	NET_RX_DROP     (packet was dropped, but freed)
1497  *
1498  * dev_forward_skb can be used for injecting an skb from the
1499  * start_xmit function of one device into the receive queue
1500  * of another device.
1501  *
1502  * The receiving device may be in another namespace, so
1503  * we have to clear all information in the skb that could
1504  * impact namespace isolation.
1505  */
1506 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1507 {
1508 	skb_orphan(skb);
1509 	nf_reset(skb);
1510 
1511 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1512 		atomic_long_inc(&dev->rx_dropped);
1513 		kfree_skb(skb);
1514 		return NET_RX_DROP;
1515 	}
1516 	skb_set_dev(skb, dev);
1517 	skb->tstamp.tv64 = 0;
1518 	skb->pkt_type = PACKET_HOST;
1519 	skb->protocol = eth_type_trans(skb, dev);
1520 	return netif_rx(skb);
1521 }
1522 EXPORT_SYMBOL_GPL(dev_forward_skb);
1523 
1524 static inline int deliver_skb(struct sk_buff *skb,
1525 			      struct packet_type *pt_prev,
1526 			      struct net_device *orig_dev)
1527 {
1528 	atomic_inc(&skb->users);
1529 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1530 }
1531 
1532 /*
1533  *	Support routine. Sends outgoing frames to any network
1534  *	taps currently in use.
1535  */
1536 
1537 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1538 {
1539 	struct packet_type *ptype;
1540 	struct sk_buff *skb2 = NULL;
1541 	struct packet_type *pt_prev = NULL;
1542 
1543 	rcu_read_lock();
1544 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1545 		/* Never send packets back to the socket
1546 		 * they originated from - MvS (miquels@drinkel.ow.org)
1547 		 */
1548 		if ((ptype->dev == dev || !ptype->dev) &&
1549 		    (ptype->af_packet_priv == NULL ||
1550 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1551 			if (pt_prev) {
1552 				deliver_skb(skb2, pt_prev, skb->dev);
1553 				pt_prev = ptype;
1554 				continue;
1555 			}
1556 
1557 			skb2 = skb_clone(skb, GFP_ATOMIC);
1558 			if (!skb2)
1559 				break;
1560 
1561 			net_timestamp_set(skb2);
1562 
1563 			/* skb->nh should be correctly
1564 			   set by sender, so that the second statement is
1565 			   just protection against buggy protocols.
1566 			 */
1567 			skb_reset_mac_header(skb2);
1568 
1569 			if (skb_network_header(skb2) < skb2->data ||
1570 			    skb2->network_header > skb2->tail) {
1571 				if (net_ratelimit())
1572 					printk(KERN_CRIT "protocol %04x is "
1573 					       "buggy, dev %s\n",
1574 					       ntohs(skb2->protocol),
1575 					       dev->name);
1576 				skb_reset_network_header(skb2);
1577 			}
1578 
1579 			skb2->transport_header = skb2->network_header;
1580 			skb2->pkt_type = PACKET_OUTGOING;
1581 			pt_prev = ptype;
1582 		}
1583 	}
1584 	if (pt_prev)
1585 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1586 	rcu_read_unlock();
1587 }
1588 
1589 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1590  * @dev: Network device
1591  * @txq: number of queues available
1592  *
1593  * If real_num_tx_queues is changed the tc mappings may no longer be
1594  * valid. To resolve this verify the tc mapping remains valid and if
1595  * not NULL the mapping. With no priorities mapping to this
1596  * offset/count pair it will no longer be used. In the worst case TC0
1597  * is invalid nothing can be done so disable priority mappings. If is
1598  * expected that drivers will fix this mapping if they can before
1599  * calling netif_set_real_num_tx_queues.
1600  */
1601 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1602 {
1603 	int i;
1604 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1605 
1606 	/* If TC0 is invalidated disable TC mapping */
1607 	if (tc->offset + tc->count > txq) {
1608 		pr_warning("Number of in use tx queues changed "
1609 			   "invalidating tc mappings. Priority "
1610 			   "traffic classification disabled!\n");
1611 		dev->num_tc = 0;
1612 		return;
1613 	}
1614 
1615 	/* Invalidated prio to tc mappings set to TC0 */
1616 	for (i = 1; i < TC_BITMASK + 1; i++) {
1617 		int q = netdev_get_prio_tc_map(dev, i);
1618 
1619 		tc = &dev->tc_to_txq[q];
1620 		if (tc->offset + tc->count > txq) {
1621 			pr_warning("Number of in use tx queues "
1622 				   "changed. Priority %i to tc "
1623 				   "mapping %i is no longer valid "
1624 				   "setting map to 0\n",
1625 				   i, q);
1626 			netdev_set_prio_tc_map(dev, i, 0);
1627 		}
1628 	}
1629 }
1630 
1631 /*
1632  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1633  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1634  */
1635 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1636 {
1637 	int rc;
1638 
1639 	if (txq < 1 || txq > dev->num_tx_queues)
1640 		return -EINVAL;
1641 
1642 	if (dev->reg_state == NETREG_REGISTERED ||
1643 	    dev->reg_state == NETREG_UNREGISTERING) {
1644 		ASSERT_RTNL();
1645 
1646 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1647 						  txq);
1648 		if (rc)
1649 			return rc;
1650 
1651 		if (dev->num_tc)
1652 			netif_setup_tc(dev, txq);
1653 
1654 		if (txq < dev->real_num_tx_queues)
1655 			qdisc_reset_all_tx_gt(dev, txq);
1656 	}
1657 
1658 	dev->real_num_tx_queues = txq;
1659 	return 0;
1660 }
1661 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1662 
1663 #ifdef CONFIG_RPS
1664 /**
1665  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1666  *	@dev: Network device
1667  *	@rxq: Actual number of RX queues
1668  *
1669  *	This must be called either with the rtnl_lock held or before
1670  *	registration of the net device.  Returns 0 on success, or a
1671  *	negative error code.  If called before registration, it always
1672  *	succeeds.
1673  */
1674 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1675 {
1676 	int rc;
1677 
1678 	if (rxq < 1 || rxq > dev->num_rx_queues)
1679 		return -EINVAL;
1680 
1681 	if (dev->reg_state == NETREG_REGISTERED) {
1682 		ASSERT_RTNL();
1683 
1684 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1685 						  rxq);
1686 		if (rc)
1687 			return rc;
1688 	}
1689 
1690 	dev->real_num_rx_queues = rxq;
1691 	return 0;
1692 }
1693 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1694 #endif
1695 
1696 static inline void __netif_reschedule(struct Qdisc *q)
1697 {
1698 	struct softnet_data *sd;
1699 	unsigned long flags;
1700 
1701 	local_irq_save(flags);
1702 	sd = &__get_cpu_var(softnet_data);
1703 	q->next_sched = NULL;
1704 	*sd->output_queue_tailp = q;
1705 	sd->output_queue_tailp = &q->next_sched;
1706 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1707 	local_irq_restore(flags);
1708 }
1709 
1710 void __netif_schedule(struct Qdisc *q)
1711 {
1712 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1713 		__netif_reschedule(q);
1714 }
1715 EXPORT_SYMBOL(__netif_schedule);
1716 
1717 void dev_kfree_skb_irq(struct sk_buff *skb)
1718 {
1719 	if (atomic_dec_and_test(&skb->users)) {
1720 		struct softnet_data *sd;
1721 		unsigned long flags;
1722 
1723 		local_irq_save(flags);
1724 		sd = &__get_cpu_var(softnet_data);
1725 		skb->next = sd->completion_queue;
1726 		sd->completion_queue = skb;
1727 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1728 		local_irq_restore(flags);
1729 	}
1730 }
1731 EXPORT_SYMBOL(dev_kfree_skb_irq);
1732 
1733 void dev_kfree_skb_any(struct sk_buff *skb)
1734 {
1735 	if (in_irq() || irqs_disabled())
1736 		dev_kfree_skb_irq(skb);
1737 	else
1738 		dev_kfree_skb(skb);
1739 }
1740 EXPORT_SYMBOL(dev_kfree_skb_any);
1741 
1742 
1743 /**
1744  * netif_device_detach - mark device as removed
1745  * @dev: network device
1746  *
1747  * Mark device as removed from system and therefore no longer available.
1748  */
1749 void netif_device_detach(struct net_device *dev)
1750 {
1751 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1752 	    netif_running(dev)) {
1753 		netif_tx_stop_all_queues(dev);
1754 	}
1755 }
1756 EXPORT_SYMBOL(netif_device_detach);
1757 
1758 /**
1759  * netif_device_attach - mark device as attached
1760  * @dev: network device
1761  *
1762  * Mark device as attached from system and restart if needed.
1763  */
1764 void netif_device_attach(struct net_device *dev)
1765 {
1766 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1767 	    netif_running(dev)) {
1768 		netif_tx_wake_all_queues(dev);
1769 		__netdev_watchdog_up(dev);
1770 	}
1771 }
1772 EXPORT_SYMBOL(netif_device_attach);
1773 
1774 /**
1775  * skb_dev_set -- assign a new device to a buffer
1776  * @skb: buffer for the new device
1777  * @dev: network device
1778  *
1779  * If an skb is owned by a device already, we have to reset
1780  * all data private to the namespace a device belongs to
1781  * before assigning it a new device.
1782  */
1783 #ifdef CONFIG_NET_NS
1784 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1785 {
1786 	skb_dst_drop(skb);
1787 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1788 		secpath_reset(skb);
1789 		nf_reset(skb);
1790 		skb_init_secmark(skb);
1791 		skb->mark = 0;
1792 		skb->priority = 0;
1793 		skb->nf_trace = 0;
1794 		skb->ipvs_property = 0;
1795 #ifdef CONFIG_NET_SCHED
1796 		skb->tc_index = 0;
1797 #endif
1798 	}
1799 	skb->dev = dev;
1800 }
1801 EXPORT_SYMBOL(skb_set_dev);
1802 #endif /* CONFIG_NET_NS */
1803 
1804 /*
1805  * Invalidate hardware checksum when packet is to be mangled, and
1806  * complete checksum manually on outgoing path.
1807  */
1808 int skb_checksum_help(struct sk_buff *skb)
1809 {
1810 	__wsum csum;
1811 	int ret = 0, offset;
1812 
1813 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1814 		goto out_set_summed;
1815 
1816 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1817 		/* Let GSO fix up the checksum. */
1818 		goto out_set_summed;
1819 	}
1820 
1821 	offset = skb_checksum_start_offset(skb);
1822 	BUG_ON(offset >= skb_headlen(skb));
1823 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1824 
1825 	offset += skb->csum_offset;
1826 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1827 
1828 	if (skb_cloned(skb) &&
1829 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1830 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1831 		if (ret)
1832 			goto out;
1833 	}
1834 
1835 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1836 out_set_summed:
1837 	skb->ip_summed = CHECKSUM_NONE;
1838 out:
1839 	return ret;
1840 }
1841 EXPORT_SYMBOL(skb_checksum_help);
1842 
1843 /**
1844  *	skb_gso_segment - Perform segmentation on skb.
1845  *	@skb: buffer to segment
1846  *	@features: features for the output path (see dev->features)
1847  *
1848  *	This function segments the given skb and returns a list of segments.
1849  *
1850  *	It may return NULL if the skb requires no segmentation.  This is
1851  *	only possible when GSO is used for verifying header integrity.
1852  */
1853 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1854 {
1855 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1856 	struct packet_type *ptype;
1857 	__be16 type = skb->protocol;
1858 	int vlan_depth = ETH_HLEN;
1859 	int err;
1860 
1861 	while (type == htons(ETH_P_8021Q)) {
1862 		struct vlan_hdr *vh;
1863 
1864 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1865 			return ERR_PTR(-EINVAL);
1866 
1867 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1868 		type = vh->h_vlan_encapsulated_proto;
1869 		vlan_depth += VLAN_HLEN;
1870 	}
1871 
1872 	skb_reset_mac_header(skb);
1873 	skb->mac_len = skb->network_header - skb->mac_header;
1874 	__skb_pull(skb, skb->mac_len);
1875 
1876 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1877 		struct net_device *dev = skb->dev;
1878 		struct ethtool_drvinfo info = {};
1879 
1880 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1881 			dev->ethtool_ops->get_drvinfo(dev, &info);
1882 
1883 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1884 		     info.driver, dev ? dev->features : 0L,
1885 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1886 		     skb->len, skb->data_len, skb->ip_summed);
1887 
1888 		if (skb_header_cloned(skb) &&
1889 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1890 			return ERR_PTR(err);
1891 	}
1892 
1893 	rcu_read_lock();
1894 	list_for_each_entry_rcu(ptype,
1895 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1896 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1897 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1898 				err = ptype->gso_send_check(skb);
1899 				segs = ERR_PTR(err);
1900 				if (err || skb_gso_ok(skb, features))
1901 					break;
1902 				__skb_push(skb, (skb->data -
1903 						 skb_network_header(skb)));
1904 			}
1905 			segs = ptype->gso_segment(skb, features);
1906 			break;
1907 		}
1908 	}
1909 	rcu_read_unlock();
1910 
1911 	__skb_push(skb, skb->data - skb_mac_header(skb));
1912 
1913 	return segs;
1914 }
1915 EXPORT_SYMBOL(skb_gso_segment);
1916 
1917 /* Take action when hardware reception checksum errors are detected. */
1918 #ifdef CONFIG_BUG
1919 void netdev_rx_csum_fault(struct net_device *dev)
1920 {
1921 	if (net_ratelimit()) {
1922 		printk(KERN_ERR "%s: hw csum failure.\n",
1923 			dev ? dev->name : "<unknown>");
1924 		dump_stack();
1925 	}
1926 }
1927 EXPORT_SYMBOL(netdev_rx_csum_fault);
1928 #endif
1929 
1930 /* Actually, we should eliminate this check as soon as we know, that:
1931  * 1. IOMMU is present and allows to map all the memory.
1932  * 2. No high memory really exists on this machine.
1933  */
1934 
1935 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1936 {
1937 #ifdef CONFIG_HIGHMEM
1938 	int i;
1939 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1940 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1941 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1942 				return 1;
1943 	}
1944 
1945 	if (PCI_DMA_BUS_IS_PHYS) {
1946 		struct device *pdev = dev->dev.parent;
1947 
1948 		if (!pdev)
1949 			return 0;
1950 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1951 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1952 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1953 				return 1;
1954 		}
1955 	}
1956 #endif
1957 	return 0;
1958 }
1959 
1960 struct dev_gso_cb {
1961 	void (*destructor)(struct sk_buff *skb);
1962 };
1963 
1964 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1965 
1966 static void dev_gso_skb_destructor(struct sk_buff *skb)
1967 {
1968 	struct dev_gso_cb *cb;
1969 
1970 	do {
1971 		struct sk_buff *nskb = skb->next;
1972 
1973 		skb->next = nskb->next;
1974 		nskb->next = NULL;
1975 		kfree_skb(nskb);
1976 	} while (skb->next);
1977 
1978 	cb = DEV_GSO_CB(skb);
1979 	if (cb->destructor)
1980 		cb->destructor(skb);
1981 }
1982 
1983 /**
1984  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1985  *	@skb: buffer to segment
1986  *	@features: device features as applicable to this skb
1987  *
1988  *	This function segments the given skb and stores the list of segments
1989  *	in skb->next.
1990  */
1991 static int dev_gso_segment(struct sk_buff *skb, int features)
1992 {
1993 	struct sk_buff *segs;
1994 
1995 	segs = skb_gso_segment(skb, features);
1996 
1997 	/* Verifying header integrity only. */
1998 	if (!segs)
1999 		return 0;
2000 
2001 	if (IS_ERR(segs))
2002 		return PTR_ERR(segs);
2003 
2004 	skb->next = segs;
2005 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2006 	skb->destructor = dev_gso_skb_destructor;
2007 
2008 	return 0;
2009 }
2010 
2011 /*
2012  * Try to orphan skb early, right before transmission by the device.
2013  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2014  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2015  */
2016 static inline void skb_orphan_try(struct sk_buff *skb)
2017 {
2018 	struct sock *sk = skb->sk;
2019 
2020 	if (sk && !skb_shinfo(skb)->tx_flags) {
2021 		/* skb_tx_hash() wont be able to get sk.
2022 		 * We copy sk_hash into skb->rxhash
2023 		 */
2024 		if (!skb->rxhash)
2025 			skb->rxhash = sk->sk_hash;
2026 		skb_orphan(skb);
2027 	}
2028 }
2029 
2030 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2031 {
2032 	return ((features & NETIF_F_GEN_CSUM) ||
2033 		((features & NETIF_F_V4_CSUM) &&
2034 		 protocol == htons(ETH_P_IP)) ||
2035 		((features & NETIF_F_V6_CSUM) &&
2036 		 protocol == htons(ETH_P_IPV6)) ||
2037 		((features & NETIF_F_FCOE_CRC) &&
2038 		 protocol == htons(ETH_P_FCOE)));
2039 }
2040 
2041 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2042 {
2043 	if (!can_checksum_protocol(features, protocol)) {
2044 		features &= ~NETIF_F_ALL_CSUM;
2045 		features &= ~NETIF_F_SG;
2046 	} else if (illegal_highdma(skb->dev, skb)) {
2047 		features &= ~NETIF_F_SG;
2048 	}
2049 
2050 	return features;
2051 }
2052 
2053 u32 netif_skb_features(struct sk_buff *skb)
2054 {
2055 	__be16 protocol = skb->protocol;
2056 	u32 features = skb->dev->features;
2057 
2058 	if (protocol == htons(ETH_P_8021Q)) {
2059 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2060 		protocol = veh->h_vlan_encapsulated_proto;
2061 	} else if (!vlan_tx_tag_present(skb)) {
2062 		return harmonize_features(skb, protocol, features);
2063 	}
2064 
2065 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2066 
2067 	if (protocol != htons(ETH_P_8021Q)) {
2068 		return harmonize_features(skb, protocol, features);
2069 	} else {
2070 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2071 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2072 		return harmonize_features(skb, protocol, features);
2073 	}
2074 }
2075 EXPORT_SYMBOL(netif_skb_features);
2076 
2077 /*
2078  * Returns true if either:
2079  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2080  *	2. skb is fragmented and the device does not support SG, or if
2081  *	   at least one of fragments is in highmem and device does not
2082  *	   support DMA from it.
2083  */
2084 static inline int skb_needs_linearize(struct sk_buff *skb,
2085 				      int features)
2086 {
2087 	return skb_is_nonlinear(skb) &&
2088 			((skb_has_frag_list(skb) &&
2089 				!(features & NETIF_F_FRAGLIST)) ||
2090 			(skb_shinfo(skb)->nr_frags &&
2091 				!(features & NETIF_F_SG)));
2092 }
2093 
2094 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2095 			struct netdev_queue *txq)
2096 {
2097 	const struct net_device_ops *ops = dev->netdev_ops;
2098 	int rc = NETDEV_TX_OK;
2099 	unsigned int skb_len;
2100 
2101 	if (likely(!skb->next)) {
2102 		u32 features;
2103 
2104 		/*
2105 		 * If device doesn't need skb->dst, release it right now while
2106 		 * its hot in this cpu cache
2107 		 */
2108 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2109 			skb_dst_drop(skb);
2110 
2111 		if (!list_empty(&ptype_all))
2112 			dev_queue_xmit_nit(skb, dev);
2113 
2114 		skb_orphan_try(skb);
2115 
2116 		features = netif_skb_features(skb);
2117 
2118 		if (vlan_tx_tag_present(skb) &&
2119 		    !(features & NETIF_F_HW_VLAN_TX)) {
2120 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2121 			if (unlikely(!skb))
2122 				goto out;
2123 
2124 			skb->vlan_tci = 0;
2125 		}
2126 
2127 		if (netif_needs_gso(skb, features)) {
2128 			if (unlikely(dev_gso_segment(skb, features)))
2129 				goto out_kfree_skb;
2130 			if (skb->next)
2131 				goto gso;
2132 		} else {
2133 			if (skb_needs_linearize(skb, features) &&
2134 			    __skb_linearize(skb))
2135 				goto out_kfree_skb;
2136 
2137 			/* If packet is not checksummed and device does not
2138 			 * support checksumming for this protocol, complete
2139 			 * checksumming here.
2140 			 */
2141 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2142 				skb_set_transport_header(skb,
2143 					skb_checksum_start_offset(skb));
2144 				if (!(features & NETIF_F_ALL_CSUM) &&
2145 				     skb_checksum_help(skb))
2146 					goto out_kfree_skb;
2147 			}
2148 		}
2149 
2150 		skb_len = skb->len;
2151 		rc = ops->ndo_start_xmit(skb, dev);
2152 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2153 		if (rc == NETDEV_TX_OK)
2154 			txq_trans_update(txq);
2155 		return rc;
2156 	}
2157 
2158 gso:
2159 	do {
2160 		struct sk_buff *nskb = skb->next;
2161 
2162 		skb->next = nskb->next;
2163 		nskb->next = NULL;
2164 
2165 		/*
2166 		 * If device doesn't need nskb->dst, release it right now while
2167 		 * its hot in this cpu cache
2168 		 */
2169 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2170 			skb_dst_drop(nskb);
2171 
2172 		skb_len = nskb->len;
2173 		rc = ops->ndo_start_xmit(nskb, dev);
2174 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2175 		if (unlikely(rc != NETDEV_TX_OK)) {
2176 			if (rc & ~NETDEV_TX_MASK)
2177 				goto out_kfree_gso_skb;
2178 			nskb->next = skb->next;
2179 			skb->next = nskb;
2180 			return rc;
2181 		}
2182 		txq_trans_update(txq);
2183 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2184 			return NETDEV_TX_BUSY;
2185 	} while (skb->next);
2186 
2187 out_kfree_gso_skb:
2188 	if (likely(skb->next == NULL))
2189 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2190 out_kfree_skb:
2191 	kfree_skb(skb);
2192 out:
2193 	return rc;
2194 }
2195 
2196 static u32 hashrnd __read_mostly;
2197 
2198 /*
2199  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2200  * to be used as a distribution range.
2201  */
2202 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2203 		  unsigned int num_tx_queues)
2204 {
2205 	u32 hash;
2206 	u16 qoffset = 0;
2207 	u16 qcount = num_tx_queues;
2208 
2209 	if (skb_rx_queue_recorded(skb)) {
2210 		hash = skb_get_rx_queue(skb);
2211 		while (unlikely(hash >= num_tx_queues))
2212 			hash -= num_tx_queues;
2213 		return hash;
2214 	}
2215 
2216 	if (dev->num_tc) {
2217 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2218 		qoffset = dev->tc_to_txq[tc].offset;
2219 		qcount = dev->tc_to_txq[tc].count;
2220 	}
2221 
2222 	if (skb->sk && skb->sk->sk_hash)
2223 		hash = skb->sk->sk_hash;
2224 	else
2225 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2226 	hash = jhash_1word(hash, hashrnd);
2227 
2228 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2229 }
2230 EXPORT_SYMBOL(__skb_tx_hash);
2231 
2232 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2233 {
2234 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2235 		if (net_ratelimit()) {
2236 			pr_warning("%s selects TX queue %d, but "
2237 				"real number of TX queues is %d\n",
2238 				dev->name, queue_index, dev->real_num_tx_queues);
2239 		}
2240 		return 0;
2241 	}
2242 	return queue_index;
2243 }
2244 
2245 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2246 {
2247 #ifdef CONFIG_XPS
2248 	struct xps_dev_maps *dev_maps;
2249 	struct xps_map *map;
2250 	int queue_index = -1;
2251 
2252 	rcu_read_lock();
2253 	dev_maps = rcu_dereference(dev->xps_maps);
2254 	if (dev_maps) {
2255 		map = rcu_dereference(
2256 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2257 		if (map) {
2258 			if (map->len == 1)
2259 				queue_index = map->queues[0];
2260 			else {
2261 				u32 hash;
2262 				if (skb->sk && skb->sk->sk_hash)
2263 					hash = skb->sk->sk_hash;
2264 				else
2265 					hash = (__force u16) skb->protocol ^
2266 					    skb->rxhash;
2267 				hash = jhash_1word(hash, hashrnd);
2268 				queue_index = map->queues[
2269 				    ((u64)hash * map->len) >> 32];
2270 			}
2271 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2272 				queue_index = -1;
2273 		}
2274 	}
2275 	rcu_read_unlock();
2276 
2277 	return queue_index;
2278 #else
2279 	return -1;
2280 #endif
2281 }
2282 
2283 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2284 					struct sk_buff *skb)
2285 {
2286 	int queue_index;
2287 	const struct net_device_ops *ops = dev->netdev_ops;
2288 
2289 	if (dev->real_num_tx_queues == 1)
2290 		queue_index = 0;
2291 	else if (ops->ndo_select_queue) {
2292 		queue_index = ops->ndo_select_queue(dev, skb);
2293 		queue_index = dev_cap_txqueue(dev, queue_index);
2294 	} else {
2295 		struct sock *sk = skb->sk;
2296 		queue_index = sk_tx_queue_get(sk);
2297 
2298 		if (queue_index < 0 || skb->ooo_okay ||
2299 		    queue_index >= dev->real_num_tx_queues) {
2300 			int old_index = queue_index;
2301 
2302 			queue_index = get_xps_queue(dev, skb);
2303 			if (queue_index < 0)
2304 				queue_index = skb_tx_hash(dev, skb);
2305 
2306 			if (queue_index != old_index && sk) {
2307 				struct dst_entry *dst =
2308 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2309 
2310 				if (dst && skb_dst(skb) == dst)
2311 					sk_tx_queue_set(sk, queue_index);
2312 			}
2313 		}
2314 	}
2315 
2316 	skb_set_queue_mapping(skb, queue_index);
2317 	return netdev_get_tx_queue(dev, queue_index);
2318 }
2319 
2320 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2321 				 struct net_device *dev,
2322 				 struct netdev_queue *txq)
2323 {
2324 	spinlock_t *root_lock = qdisc_lock(q);
2325 	bool contended;
2326 	int rc;
2327 
2328 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2329 	qdisc_calculate_pkt_len(skb, q);
2330 	/*
2331 	 * Heuristic to force contended enqueues to serialize on a
2332 	 * separate lock before trying to get qdisc main lock.
2333 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2334 	 * and dequeue packets faster.
2335 	 */
2336 	contended = qdisc_is_running(q);
2337 	if (unlikely(contended))
2338 		spin_lock(&q->busylock);
2339 
2340 	spin_lock(root_lock);
2341 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2342 		kfree_skb(skb);
2343 		rc = NET_XMIT_DROP;
2344 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2345 		   qdisc_run_begin(q)) {
2346 		/*
2347 		 * This is a work-conserving queue; there are no old skbs
2348 		 * waiting to be sent out; and the qdisc is not running -
2349 		 * xmit the skb directly.
2350 		 */
2351 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2352 			skb_dst_force(skb);
2353 
2354 		qdisc_bstats_update(q, skb);
2355 
2356 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2357 			if (unlikely(contended)) {
2358 				spin_unlock(&q->busylock);
2359 				contended = false;
2360 			}
2361 			__qdisc_run(q);
2362 		} else
2363 			qdisc_run_end(q);
2364 
2365 		rc = NET_XMIT_SUCCESS;
2366 	} else {
2367 		skb_dst_force(skb);
2368 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2369 		if (qdisc_run_begin(q)) {
2370 			if (unlikely(contended)) {
2371 				spin_unlock(&q->busylock);
2372 				contended = false;
2373 			}
2374 			__qdisc_run(q);
2375 		}
2376 	}
2377 	spin_unlock(root_lock);
2378 	if (unlikely(contended))
2379 		spin_unlock(&q->busylock);
2380 	return rc;
2381 }
2382 
2383 static DEFINE_PER_CPU(int, xmit_recursion);
2384 #define RECURSION_LIMIT 10
2385 
2386 /**
2387  *	dev_queue_xmit - transmit a buffer
2388  *	@skb: buffer to transmit
2389  *
2390  *	Queue a buffer for transmission to a network device. The caller must
2391  *	have set the device and priority and built the buffer before calling
2392  *	this function. The function can be called from an interrupt.
2393  *
2394  *	A negative errno code is returned on a failure. A success does not
2395  *	guarantee the frame will be transmitted as it may be dropped due
2396  *	to congestion or traffic shaping.
2397  *
2398  * -----------------------------------------------------------------------------------
2399  *      I notice this method can also return errors from the queue disciplines,
2400  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2401  *      be positive.
2402  *
2403  *      Regardless of the return value, the skb is consumed, so it is currently
2404  *      difficult to retry a send to this method.  (You can bump the ref count
2405  *      before sending to hold a reference for retry if you are careful.)
2406  *
2407  *      When calling this method, interrupts MUST be enabled.  This is because
2408  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2409  *          --BLG
2410  */
2411 int dev_queue_xmit(struct sk_buff *skb)
2412 {
2413 	struct net_device *dev = skb->dev;
2414 	struct netdev_queue *txq;
2415 	struct Qdisc *q;
2416 	int rc = -ENOMEM;
2417 
2418 	/* Disable soft irqs for various locks below. Also
2419 	 * stops preemption for RCU.
2420 	 */
2421 	rcu_read_lock_bh();
2422 
2423 	txq = dev_pick_tx(dev, skb);
2424 	q = rcu_dereference_bh(txq->qdisc);
2425 
2426 #ifdef CONFIG_NET_CLS_ACT
2427 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2428 #endif
2429 	trace_net_dev_queue(skb);
2430 	if (q->enqueue) {
2431 		rc = __dev_xmit_skb(skb, q, dev, txq);
2432 		goto out;
2433 	}
2434 
2435 	/* The device has no queue. Common case for software devices:
2436 	   loopback, all the sorts of tunnels...
2437 
2438 	   Really, it is unlikely that netif_tx_lock protection is necessary
2439 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2440 	   counters.)
2441 	   However, it is possible, that they rely on protection
2442 	   made by us here.
2443 
2444 	   Check this and shot the lock. It is not prone from deadlocks.
2445 	   Either shot noqueue qdisc, it is even simpler 8)
2446 	 */
2447 	if (dev->flags & IFF_UP) {
2448 		int cpu = smp_processor_id(); /* ok because BHs are off */
2449 
2450 		if (txq->xmit_lock_owner != cpu) {
2451 
2452 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2453 				goto recursion_alert;
2454 
2455 			HARD_TX_LOCK(dev, txq, cpu);
2456 
2457 			if (!netif_tx_queue_stopped(txq)) {
2458 				__this_cpu_inc(xmit_recursion);
2459 				rc = dev_hard_start_xmit(skb, dev, txq);
2460 				__this_cpu_dec(xmit_recursion);
2461 				if (dev_xmit_complete(rc)) {
2462 					HARD_TX_UNLOCK(dev, txq);
2463 					goto out;
2464 				}
2465 			}
2466 			HARD_TX_UNLOCK(dev, txq);
2467 			if (net_ratelimit())
2468 				printk(KERN_CRIT "Virtual device %s asks to "
2469 				       "queue packet!\n", dev->name);
2470 		} else {
2471 			/* Recursion is detected! It is possible,
2472 			 * unfortunately
2473 			 */
2474 recursion_alert:
2475 			if (net_ratelimit())
2476 				printk(KERN_CRIT "Dead loop on virtual device "
2477 				       "%s, fix it urgently!\n", dev->name);
2478 		}
2479 	}
2480 
2481 	rc = -ENETDOWN;
2482 	rcu_read_unlock_bh();
2483 
2484 	kfree_skb(skb);
2485 	return rc;
2486 out:
2487 	rcu_read_unlock_bh();
2488 	return rc;
2489 }
2490 EXPORT_SYMBOL(dev_queue_xmit);
2491 
2492 
2493 /*=======================================================================
2494 			Receiver routines
2495   =======================================================================*/
2496 
2497 int netdev_max_backlog __read_mostly = 1000;
2498 int netdev_tstamp_prequeue __read_mostly = 1;
2499 int netdev_budget __read_mostly = 300;
2500 int weight_p __read_mostly = 64;            /* old backlog weight */
2501 
2502 /* Called with irq disabled */
2503 static inline void ____napi_schedule(struct softnet_data *sd,
2504 				     struct napi_struct *napi)
2505 {
2506 	list_add_tail(&napi->poll_list, &sd->poll_list);
2507 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2508 }
2509 
2510 /*
2511  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2512  * and src/dst port numbers. Returns a non-zero hash number on success
2513  * and 0 on failure.
2514  */
2515 __u32 __skb_get_rxhash(struct sk_buff *skb)
2516 {
2517 	int nhoff, hash = 0, poff;
2518 	const struct ipv6hdr *ip6;
2519 	const struct iphdr *ip;
2520 	u8 ip_proto;
2521 	u32 addr1, addr2, ihl;
2522 	union {
2523 		u32 v32;
2524 		u16 v16[2];
2525 	} ports;
2526 
2527 	nhoff = skb_network_offset(skb);
2528 
2529 	switch (skb->protocol) {
2530 	case __constant_htons(ETH_P_IP):
2531 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2532 			goto done;
2533 
2534 		ip = (const struct iphdr *) (skb->data + nhoff);
2535 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2536 			ip_proto = 0;
2537 		else
2538 			ip_proto = ip->protocol;
2539 		addr1 = (__force u32) ip->saddr;
2540 		addr2 = (__force u32) ip->daddr;
2541 		ihl = ip->ihl;
2542 		break;
2543 	case __constant_htons(ETH_P_IPV6):
2544 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2545 			goto done;
2546 
2547 		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2548 		ip_proto = ip6->nexthdr;
2549 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2550 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2551 		ihl = (40 >> 2);
2552 		break;
2553 	default:
2554 		goto done;
2555 	}
2556 
2557 	ports.v32 = 0;
2558 	poff = proto_ports_offset(ip_proto);
2559 	if (poff >= 0) {
2560 		nhoff += ihl * 4 + poff;
2561 		if (pskb_may_pull(skb, nhoff + 4)) {
2562 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2563 			if (ports.v16[1] < ports.v16[0])
2564 				swap(ports.v16[0], ports.v16[1]);
2565 		}
2566 	}
2567 
2568 	/* get a consistent hash (same value on both flow directions) */
2569 	if (addr2 < addr1)
2570 		swap(addr1, addr2);
2571 
2572 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2573 	if (!hash)
2574 		hash = 1;
2575 
2576 done:
2577 	return hash;
2578 }
2579 EXPORT_SYMBOL(__skb_get_rxhash);
2580 
2581 #ifdef CONFIG_RPS
2582 
2583 /* One global table that all flow-based protocols share. */
2584 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2585 EXPORT_SYMBOL(rps_sock_flow_table);
2586 
2587 static struct rps_dev_flow *
2588 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2589 	    struct rps_dev_flow *rflow, u16 next_cpu)
2590 {
2591 	u16 tcpu;
2592 
2593 	tcpu = rflow->cpu = next_cpu;
2594 	if (tcpu != RPS_NO_CPU) {
2595 #ifdef CONFIG_RFS_ACCEL
2596 		struct netdev_rx_queue *rxqueue;
2597 		struct rps_dev_flow_table *flow_table;
2598 		struct rps_dev_flow *old_rflow;
2599 		u32 flow_id;
2600 		u16 rxq_index;
2601 		int rc;
2602 
2603 		/* Should we steer this flow to a different hardware queue? */
2604 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2605 		    !(dev->features & NETIF_F_NTUPLE))
2606 			goto out;
2607 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2608 		if (rxq_index == skb_get_rx_queue(skb))
2609 			goto out;
2610 
2611 		rxqueue = dev->_rx + rxq_index;
2612 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2613 		if (!flow_table)
2614 			goto out;
2615 		flow_id = skb->rxhash & flow_table->mask;
2616 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2617 							rxq_index, flow_id);
2618 		if (rc < 0)
2619 			goto out;
2620 		old_rflow = rflow;
2621 		rflow = &flow_table->flows[flow_id];
2622 		rflow->cpu = next_cpu;
2623 		rflow->filter = rc;
2624 		if (old_rflow->filter == rflow->filter)
2625 			old_rflow->filter = RPS_NO_FILTER;
2626 	out:
2627 #endif
2628 		rflow->last_qtail =
2629 			per_cpu(softnet_data, tcpu).input_queue_head;
2630 	}
2631 
2632 	return rflow;
2633 }
2634 
2635 /*
2636  * get_rps_cpu is called from netif_receive_skb and returns the target
2637  * CPU from the RPS map of the receiving queue for a given skb.
2638  * rcu_read_lock must be held on entry.
2639  */
2640 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2641 		       struct rps_dev_flow **rflowp)
2642 {
2643 	struct netdev_rx_queue *rxqueue;
2644 	struct rps_map *map;
2645 	struct rps_dev_flow_table *flow_table;
2646 	struct rps_sock_flow_table *sock_flow_table;
2647 	int cpu = -1;
2648 	u16 tcpu;
2649 
2650 	if (skb_rx_queue_recorded(skb)) {
2651 		u16 index = skb_get_rx_queue(skb);
2652 		if (unlikely(index >= dev->real_num_rx_queues)) {
2653 			WARN_ONCE(dev->real_num_rx_queues > 1,
2654 				  "%s received packet on queue %u, but number "
2655 				  "of RX queues is %u\n",
2656 				  dev->name, index, dev->real_num_rx_queues);
2657 			goto done;
2658 		}
2659 		rxqueue = dev->_rx + index;
2660 	} else
2661 		rxqueue = dev->_rx;
2662 
2663 	map = rcu_dereference(rxqueue->rps_map);
2664 	if (map) {
2665 		if (map->len == 1 &&
2666 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2667 			tcpu = map->cpus[0];
2668 			if (cpu_online(tcpu))
2669 				cpu = tcpu;
2670 			goto done;
2671 		}
2672 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2673 		goto done;
2674 	}
2675 
2676 	skb_reset_network_header(skb);
2677 	if (!skb_get_rxhash(skb))
2678 		goto done;
2679 
2680 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2681 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2682 	if (flow_table && sock_flow_table) {
2683 		u16 next_cpu;
2684 		struct rps_dev_flow *rflow;
2685 
2686 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2687 		tcpu = rflow->cpu;
2688 
2689 		next_cpu = sock_flow_table->ents[skb->rxhash &
2690 		    sock_flow_table->mask];
2691 
2692 		/*
2693 		 * If the desired CPU (where last recvmsg was done) is
2694 		 * different from current CPU (one in the rx-queue flow
2695 		 * table entry), switch if one of the following holds:
2696 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2697 		 *   - Current CPU is offline.
2698 		 *   - The current CPU's queue tail has advanced beyond the
2699 		 *     last packet that was enqueued using this table entry.
2700 		 *     This guarantees that all previous packets for the flow
2701 		 *     have been dequeued, thus preserving in order delivery.
2702 		 */
2703 		if (unlikely(tcpu != next_cpu) &&
2704 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2705 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2706 		      rflow->last_qtail)) >= 0))
2707 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2708 
2709 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2710 			*rflowp = rflow;
2711 			cpu = tcpu;
2712 			goto done;
2713 		}
2714 	}
2715 
2716 	if (map) {
2717 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2718 
2719 		if (cpu_online(tcpu)) {
2720 			cpu = tcpu;
2721 			goto done;
2722 		}
2723 	}
2724 
2725 done:
2726 	return cpu;
2727 }
2728 
2729 #ifdef CONFIG_RFS_ACCEL
2730 
2731 /**
2732  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2733  * @dev: Device on which the filter was set
2734  * @rxq_index: RX queue index
2735  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2736  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2737  *
2738  * Drivers that implement ndo_rx_flow_steer() should periodically call
2739  * this function for each installed filter and remove the filters for
2740  * which it returns %true.
2741  */
2742 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2743 			 u32 flow_id, u16 filter_id)
2744 {
2745 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2746 	struct rps_dev_flow_table *flow_table;
2747 	struct rps_dev_flow *rflow;
2748 	bool expire = true;
2749 	int cpu;
2750 
2751 	rcu_read_lock();
2752 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2753 	if (flow_table && flow_id <= flow_table->mask) {
2754 		rflow = &flow_table->flows[flow_id];
2755 		cpu = ACCESS_ONCE(rflow->cpu);
2756 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2757 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2758 			   rflow->last_qtail) <
2759 		     (int)(10 * flow_table->mask)))
2760 			expire = false;
2761 	}
2762 	rcu_read_unlock();
2763 	return expire;
2764 }
2765 EXPORT_SYMBOL(rps_may_expire_flow);
2766 
2767 #endif /* CONFIG_RFS_ACCEL */
2768 
2769 /* Called from hardirq (IPI) context */
2770 static void rps_trigger_softirq(void *data)
2771 {
2772 	struct softnet_data *sd = data;
2773 
2774 	____napi_schedule(sd, &sd->backlog);
2775 	sd->received_rps++;
2776 }
2777 
2778 #endif /* CONFIG_RPS */
2779 
2780 /*
2781  * Check if this softnet_data structure is another cpu one
2782  * If yes, queue it to our IPI list and return 1
2783  * If no, return 0
2784  */
2785 static int rps_ipi_queued(struct softnet_data *sd)
2786 {
2787 #ifdef CONFIG_RPS
2788 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2789 
2790 	if (sd != mysd) {
2791 		sd->rps_ipi_next = mysd->rps_ipi_list;
2792 		mysd->rps_ipi_list = sd;
2793 
2794 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2795 		return 1;
2796 	}
2797 #endif /* CONFIG_RPS */
2798 	return 0;
2799 }
2800 
2801 /*
2802  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2803  * queue (may be a remote CPU queue).
2804  */
2805 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2806 			      unsigned int *qtail)
2807 {
2808 	struct softnet_data *sd;
2809 	unsigned long flags;
2810 
2811 	sd = &per_cpu(softnet_data, cpu);
2812 
2813 	local_irq_save(flags);
2814 
2815 	rps_lock(sd);
2816 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2817 		if (skb_queue_len(&sd->input_pkt_queue)) {
2818 enqueue:
2819 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2820 			input_queue_tail_incr_save(sd, qtail);
2821 			rps_unlock(sd);
2822 			local_irq_restore(flags);
2823 			return NET_RX_SUCCESS;
2824 		}
2825 
2826 		/* Schedule NAPI for backlog device
2827 		 * We can use non atomic operation since we own the queue lock
2828 		 */
2829 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2830 			if (!rps_ipi_queued(sd))
2831 				____napi_schedule(sd, &sd->backlog);
2832 		}
2833 		goto enqueue;
2834 	}
2835 
2836 	sd->dropped++;
2837 	rps_unlock(sd);
2838 
2839 	local_irq_restore(flags);
2840 
2841 	atomic_long_inc(&skb->dev->rx_dropped);
2842 	kfree_skb(skb);
2843 	return NET_RX_DROP;
2844 }
2845 
2846 /**
2847  *	netif_rx	-	post buffer to the network code
2848  *	@skb: buffer to post
2849  *
2850  *	This function receives a packet from a device driver and queues it for
2851  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2852  *	may be dropped during processing for congestion control or by the
2853  *	protocol layers.
2854  *
2855  *	return values:
2856  *	NET_RX_SUCCESS	(no congestion)
2857  *	NET_RX_DROP     (packet was dropped)
2858  *
2859  */
2860 
2861 int netif_rx(struct sk_buff *skb)
2862 {
2863 	int ret;
2864 
2865 	/* if netpoll wants it, pretend we never saw it */
2866 	if (netpoll_rx(skb))
2867 		return NET_RX_DROP;
2868 
2869 	if (netdev_tstamp_prequeue)
2870 		net_timestamp_check(skb);
2871 
2872 	trace_netif_rx(skb);
2873 #ifdef CONFIG_RPS
2874 	{
2875 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2876 		int cpu;
2877 
2878 		preempt_disable();
2879 		rcu_read_lock();
2880 
2881 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2882 		if (cpu < 0)
2883 			cpu = smp_processor_id();
2884 
2885 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2886 
2887 		rcu_read_unlock();
2888 		preempt_enable();
2889 	}
2890 #else
2891 	{
2892 		unsigned int qtail;
2893 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2894 		put_cpu();
2895 	}
2896 #endif
2897 	return ret;
2898 }
2899 EXPORT_SYMBOL(netif_rx);
2900 
2901 int netif_rx_ni(struct sk_buff *skb)
2902 {
2903 	int err;
2904 
2905 	preempt_disable();
2906 	err = netif_rx(skb);
2907 	if (local_softirq_pending())
2908 		do_softirq();
2909 	preempt_enable();
2910 
2911 	return err;
2912 }
2913 EXPORT_SYMBOL(netif_rx_ni);
2914 
2915 static void net_tx_action(struct softirq_action *h)
2916 {
2917 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2918 
2919 	if (sd->completion_queue) {
2920 		struct sk_buff *clist;
2921 
2922 		local_irq_disable();
2923 		clist = sd->completion_queue;
2924 		sd->completion_queue = NULL;
2925 		local_irq_enable();
2926 
2927 		while (clist) {
2928 			struct sk_buff *skb = clist;
2929 			clist = clist->next;
2930 
2931 			WARN_ON(atomic_read(&skb->users));
2932 			trace_kfree_skb(skb, net_tx_action);
2933 			__kfree_skb(skb);
2934 		}
2935 	}
2936 
2937 	if (sd->output_queue) {
2938 		struct Qdisc *head;
2939 
2940 		local_irq_disable();
2941 		head = sd->output_queue;
2942 		sd->output_queue = NULL;
2943 		sd->output_queue_tailp = &sd->output_queue;
2944 		local_irq_enable();
2945 
2946 		while (head) {
2947 			struct Qdisc *q = head;
2948 			spinlock_t *root_lock;
2949 
2950 			head = head->next_sched;
2951 
2952 			root_lock = qdisc_lock(q);
2953 			if (spin_trylock(root_lock)) {
2954 				smp_mb__before_clear_bit();
2955 				clear_bit(__QDISC_STATE_SCHED,
2956 					  &q->state);
2957 				qdisc_run(q);
2958 				spin_unlock(root_lock);
2959 			} else {
2960 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2961 					      &q->state)) {
2962 					__netif_reschedule(q);
2963 				} else {
2964 					smp_mb__before_clear_bit();
2965 					clear_bit(__QDISC_STATE_SCHED,
2966 						  &q->state);
2967 				}
2968 			}
2969 		}
2970 	}
2971 }
2972 
2973 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2974     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2975 /* This hook is defined here for ATM LANE */
2976 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2977 			     unsigned char *addr) __read_mostly;
2978 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2979 #endif
2980 
2981 #ifdef CONFIG_NET_CLS_ACT
2982 /* TODO: Maybe we should just force sch_ingress to be compiled in
2983  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2984  * a compare and 2 stores extra right now if we dont have it on
2985  * but have CONFIG_NET_CLS_ACT
2986  * NOTE: This doesn't stop any functionality; if you dont have
2987  * the ingress scheduler, you just can't add policies on ingress.
2988  *
2989  */
2990 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2991 {
2992 	struct net_device *dev = skb->dev;
2993 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2994 	int result = TC_ACT_OK;
2995 	struct Qdisc *q;
2996 
2997 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2998 		if (net_ratelimit())
2999 			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3000 			       skb->skb_iif, dev->ifindex);
3001 		return TC_ACT_SHOT;
3002 	}
3003 
3004 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3005 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3006 
3007 	q = rxq->qdisc;
3008 	if (q != &noop_qdisc) {
3009 		spin_lock(qdisc_lock(q));
3010 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3011 			result = qdisc_enqueue_root(skb, q);
3012 		spin_unlock(qdisc_lock(q));
3013 	}
3014 
3015 	return result;
3016 }
3017 
3018 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3019 					 struct packet_type **pt_prev,
3020 					 int *ret, struct net_device *orig_dev)
3021 {
3022 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3023 
3024 	if (!rxq || rxq->qdisc == &noop_qdisc)
3025 		goto out;
3026 
3027 	if (*pt_prev) {
3028 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3029 		*pt_prev = NULL;
3030 	}
3031 
3032 	switch (ing_filter(skb, rxq)) {
3033 	case TC_ACT_SHOT:
3034 	case TC_ACT_STOLEN:
3035 		kfree_skb(skb);
3036 		return NULL;
3037 	}
3038 
3039 out:
3040 	skb->tc_verd = 0;
3041 	return skb;
3042 }
3043 #endif
3044 
3045 /**
3046  *	netdev_rx_handler_register - register receive handler
3047  *	@dev: device to register a handler for
3048  *	@rx_handler: receive handler to register
3049  *	@rx_handler_data: data pointer that is used by rx handler
3050  *
3051  *	Register a receive hander for a device. This handler will then be
3052  *	called from __netif_receive_skb. A negative errno code is returned
3053  *	on a failure.
3054  *
3055  *	The caller must hold the rtnl_mutex.
3056  *
3057  *	For a general description of rx_handler, see enum rx_handler_result.
3058  */
3059 int netdev_rx_handler_register(struct net_device *dev,
3060 			       rx_handler_func_t *rx_handler,
3061 			       void *rx_handler_data)
3062 {
3063 	ASSERT_RTNL();
3064 
3065 	if (dev->rx_handler)
3066 		return -EBUSY;
3067 
3068 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3069 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3070 
3071 	return 0;
3072 }
3073 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3074 
3075 /**
3076  *	netdev_rx_handler_unregister - unregister receive handler
3077  *	@dev: device to unregister a handler from
3078  *
3079  *	Unregister a receive hander from a device.
3080  *
3081  *	The caller must hold the rtnl_mutex.
3082  */
3083 void netdev_rx_handler_unregister(struct net_device *dev)
3084 {
3085 
3086 	ASSERT_RTNL();
3087 	rcu_assign_pointer(dev->rx_handler, NULL);
3088 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3089 }
3090 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3091 
3092 static int __netif_receive_skb(struct sk_buff *skb)
3093 {
3094 	struct packet_type *ptype, *pt_prev;
3095 	rx_handler_func_t *rx_handler;
3096 	struct net_device *orig_dev;
3097 	struct net_device *null_or_dev;
3098 	bool deliver_exact = false;
3099 	int ret = NET_RX_DROP;
3100 	__be16 type;
3101 
3102 	if (!netdev_tstamp_prequeue)
3103 		net_timestamp_check(skb);
3104 
3105 	trace_netif_receive_skb(skb);
3106 
3107 	/* if we've gotten here through NAPI, check netpoll */
3108 	if (netpoll_receive_skb(skb))
3109 		return NET_RX_DROP;
3110 
3111 	if (!skb->skb_iif)
3112 		skb->skb_iif = skb->dev->ifindex;
3113 	orig_dev = skb->dev;
3114 
3115 	skb_reset_network_header(skb);
3116 	skb_reset_transport_header(skb);
3117 	skb->mac_len = skb->network_header - skb->mac_header;
3118 
3119 	pt_prev = NULL;
3120 
3121 	rcu_read_lock();
3122 
3123 another_round:
3124 
3125 	__this_cpu_inc(softnet_data.processed);
3126 
3127 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3128 		skb = vlan_untag(skb);
3129 		if (unlikely(!skb))
3130 			goto out;
3131 	}
3132 
3133 #ifdef CONFIG_NET_CLS_ACT
3134 	if (skb->tc_verd & TC_NCLS) {
3135 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3136 		goto ncls;
3137 	}
3138 #endif
3139 
3140 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3141 		if (!ptype->dev || ptype->dev == skb->dev) {
3142 			if (pt_prev)
3143 				ret = deliver_skb(skb, pt_prev, orig_dev);
3144 			pt_prev = ptype;
3145 		}
3146 	}
3147 
3148 #ifdef CONFIG_NET_CLS_ACT
3149 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3150 	if (!skb)
3151 		goto out;
3152 ncls:
3153 #endif
3154 
3155 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3156 	if (rx_handler) {
3157 		if (pt_prev) {
3158 			ret = deliver_skb(skb, pt_prev, orig_dev);
3159 			pt_prev = NULL;
3160 		}
3161 		switch (rx_handler(&skb)) {
3162 		case RX_HANDLER_CONSUMED:
3163 			goto out;
3164 		case RX_HANDLER_ANOTHER:
3165 			goto another_round;
3166 		case RX_HANDLER_EXACT:
3167 			deliver_exact = true;
3168 		case RX_HANDLER_PASS:
3169 			break;
3170 		default:
3171 			BUG();
3172 		}
3173 	}
3174 
3175 	if (vlan_tx_tag_present(skb)) {
3176 		if (pt_prev) {
3177 			ret = deliver_skb(skb, pt_prev, orig_dev);
3178 			pt_prev = NULL;
3179 		}
3180 		if (vlan_do_receive(&skb)) {
3181 			ret = __netif_receive_skb(skb);
3182 			goto out;
3183 		} else if (unlikely(!skb))
3184 			goto out;
3185 	}
3186 
3187 	/* deliver only exact match when indicated */
3188 	null_or_dev = deliver_exact ? skb->dev : NULL;
3189 
3190 	type = skb->protocol;
3191 	list_for_each_entry_rcu(ptype,
3192 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3193 		if (ptype->type == type &&
3194 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3195 		     ptype->dev == orig_dev)) {
3196 			if (pt_prev)
3197 				ret = deliver_skb(skb, pt_prev, orig_dev);
3198 			pt_prev = ptype;
3199 		}
3200 	}
3201 
3202 	if (pt_prev) {
3203 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3204 	} else {
3205 		atomic_long_inc(&skb->dev->rx_dropped);
3206 		kfree_skb(skb);
3207 		/* Jamal, now you will not able to escape explaining
3208 		 * me how you were going to use this. :-)
3209 		 */
3210 		ret = NET_RX_DROP;
3211 	}
3212 
3213 out:
3214 	rcu_read_unlock();
3215 	return ret;
3216 }
3217 
3218 /**
3219  *	netif_receive_skb - process receive buffer from network
3220  *	@skb: buffer to process
3221  *
3222  *	netif_receive_skb() is the main receive data processing function.
3223  *	It always succeeds. The buffer may be dropped during processing
3224  *	for congestion control or by the protocol layers.
3225  *
3226  *	This function may only be called from softirq context and interrupts
3227  *	should be enabled.
3228  *
3229  *	Return values (usually ignored):
3230  *	NET_RX_SUCCESS: no congestion
3231  *	NET_RX_DROP: packet was dropped
3232  */
3233 int netif_receive_skb(struct sk_buff *skb)
3234 {
3235 	if (netdev_tstamp_prequeue)
3236 		net_timestamp_check(skb);
3237 
3238 	if (skb_defer_rx_timestamp(skb))
3239 		return NET_RX_SUCCESS;
3240 
3241 #ifdef CONFIG_RPS
3242 	{
3243 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3244 		int cpu, ret;
3245 
3246 		rcu_read_lock();
3247 
3248 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3249 
3250 		if (cpu >= 0) {
3251 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3252 			rcu_read_unlock();
3253 		} else {
3254 			rcu_read_unlock();
3255 			ret = __netif_receive_skb(skb);
3256 		}
3257 
3258 		return ret;
3259 	}
3260 #else
3261 	return __netif_receive_skb(skb);
3262 #endif
3263 }
3264 EXPORT_SYMBOL(netif_receive_skb);
3265 
3266 /* Network device is going away, flush any packets still pending
3267  * Called with irqs disabled.
3268  */
3269 static void flush_backlog(void *arg)
3270 {
3271 	struct net_device *dev = arg;
3272 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3273 	struct sk_buff *skb, *tmp;
3274 
3275 	rps_lock(sd);
3276 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3277 		if (skb->dev == dev) {
3278 			__skb_unlink(skb, &sd->input_pkt_queue);
3279 			kfree_skb(skb);
3280 			input_queue_head_incr(sd);
3281 		}
3282 	}
3283 	rps_unlock(sd);
3284 
3285 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3286 		if (skb->dev == dev) {
3287 			__skb_unlink(skb, &sd->process_queue);
3288 			kfree_skb(skb);
3289 			input_queue_head_incr(sd);
3290 		}
3291 	}
3292 }
3293 
3294 static int napi_gro_complete(struct sk_buff *skb)
3295 {
3296 	struct packet_type *ptype;
3297 	__be16 type = skb->protocol;
3298 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3299 	int err = -ENOENT;
3300 
3301 	if (NAPI_GRO_CB(skb)->count == 1) {
3302 		skb_shinfo(skb)->gso_size = 0;
3303 		goto out;
3304 	}
3305 
3306 	rcu_read_lock();
3307 	list_for_each_entry_rcu(ptype, head, list) {
3308 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3309 			continue;
3310 
3311 		err = ptype->gro_complete(skb);
3312 		break;
3313 	}
3314 	rcu_read_unlock();
3315 
3316 	if (err) {
3317 		WARN_ON(&ptype->list == head);
3318 		kfree_skb(skb);
3319 		return NET_RX_SUCCESS;
3320 	}
3321 
3322 out:
3323 	return netif_receive_skb(skb);
3324 }
3325 
3326 inline void napi_gro_flush(struct napi_struct *napi)
3327 {
3328 	struct sk_buff *skb, *next;
3329 
3330 	for (skb = napi->gro_list; skb; skb = next) {
3331 		next = skb->next;
3332 		skb->next = NULL;
3333 		napi_gro_complete(skb);
3334 	}
3335 
3336 	napi->gro_count = 0;
3337 	napi->gro_list = NULL;
3338 }
3339 EXPORT_SYMBOL(napi_gro_flush);
3340 
3341 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3342 {
3343 	struct sk_buff **pp = NULL;
3344 	struct packet_type *ptype;
3345 	__be16 type = skb->protocol;
3346 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3347 	int same_flow;
3348 	int mac_len;
3349 	enum gro_result ret;
3350 
3351 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3352 		goto normal;
3353 
3354 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3355 		goto normal;
3356 
3357 	rcu_read_lock();
3358 	list_for_each_entry_rcu(ptype, head, list) {
3359 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3360 			continue;
3361 
3362 		skb_set_network_header(skb, skb_gro_offset(skb));
3363 		mac_len = skb->network_header - skb->mac_header;
3364 		skb->mac_len = mac_len;
3365 		NAPI_GRO_CB(skb)->same_flow = 0;
3366 		NAPI_GRO_CB(skb)->flush = 0;
3367 		NAPI_GRO_CB(skb)->free = 0;
3368 
3369 		pp = ptype->gro_receive(&napi->gro_list, skb);
3370 		break;
3371 	}
3372 	rcu_read_unlock();
3373 
3374 	if (&ptype->list == head)
3375 		goto normal;
3376 
3377 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3378 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3379 
3380 	if (pp) {
3381 		struct sk_buff *nskb = *pp;
3382 
3383 		*pp = nskb->next;
3384 		nskb->next = NULL;
3385 		napi_gro_complete(nskb);
3386 		napi->gro_count--;
3387 	}
3388 
3389 	if (same_flow)
3390 		goto ok;
3391 
3392 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3393 		goto normal;
3394 
3395 	napi->gro_count++;
3396 	NAPI_GRO_CB(skb)->count = 1;
3397 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3398 	skb->next = napi->gro_list;
3399 	napi->gro_list = skb;
3400 	ret = GRO_HELD;
3401 
3402 pull:
3403 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3404 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3405 
3406 		BUG_ON(skb->end - skb->tail < grow);
3407 
3408 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3409 
3410 		skb->tail += grow;
3411 		skb->data_len -= grow;
3412 
3413 		skb_shinfo(skb)->frags[0].page_offset += grow;
3414 		skb_shinfo(skb)->frags[0].size -= grow;
3415 
3416 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3417 			put_page(skb_shinfo(skb)->frags[0].page);
3418 			memmove(skb_shinfo(skb)->frags,
3419 				skb_shinfo(skb)->frags + 1,
3420 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3421 		}
3422 	}
3423 
3424 ok:
3425 	return ret;
3426 
3427 normal:
3428 	ret = GRO_NORMAL;
3429 	goto pull;
3430 }
3431 EXPORT_SYMBOL(dev_gro_receive);
3432 
3433 static inline gro_result_t
3434 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3435 {
3436 	struct sk_buff *p;
3437 
3438 	for (p = napi->gro_list; p; p = p->next) {
3439 		unsigned long diffs;
3440 
3441 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3442 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3443 		diffs |= compare_ether_header(skb_mac_header(p),
3444 					      skb_gro_mac_header(skb));
3445 		NAPI_GRO_CB(p)->same_flow = !diffs;
3446 		NAPI_GRO_CB(p)->flush = 0;
3447 	}
3448 
3449 	return dev_gro_receive(napi, skb);
3450 }
3451 
3452 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3453 {
3454 	switch (ret) {
3455 	case GRO_NORMAL:
3456 		if (netif_receive_skb(skb))
3457 			ret = GRO_DROP;
3458 		break;
3459 
3460 	case GRO_DROP:
3461 	case GRO_MERGED_FREE:
3462 		kfree_skb(skb);
3463 		break;
3464 
3465 	case GRO_HELD:
3466 	case GRO_MERGED:
3467 		break;
3468 	}
3469 
3470 	return ret;
3471 }
3472 EXPORT_SYMBOL(napi_skb_finish);
3473 
3474 void skb_gro_reset_offset(struct sk_buff *skb)
3475 {
3476 	NAPI_GRO_CB(skb)->data_offset = 0;
3477 	NAPI_GRO_CB(skb)->frag0 = NULL;
3478 	NAPI_GRO_CB(skb)->frag0_len = 0;
3479 
3480 	if (skb->mac_header == skb->tail &&
3481 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3482 		NAPI_GRO_CB(skb)->frag0 =
3483 			page_address(skb_shinfo(skb)->frags[0].page) +
3484 			skb_shinfo(skb)->frags[0].page_offset;
3485 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3486 	}
3487 }
3488 EXPORT_SYMBOL(skb_gro_reset_offset);
3489 
3490 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3491 {
3492 	skb_gro_reset_offset(skb);
3493 
3494 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3495 }
3496 EXPORT_SYMBOL(napi_gro_receive);
3497 
3498 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3499 {
3500 	__skb_pull(skb, skb_headlen(skb));
3501 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3502 	skb->vlan_tci = 0;
3503 	skb->dev = napi->dev;
3504 	skb->skb_iif = 0;
3505 
3506 	napi->skb = skb;
3507 }
3508 
3509 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3510 {
3511 	struct sk_buff *skb = napi->skb;
3512 
3513 	if (!skb) {
3514 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3515 		if (skb)
3516 			napi->skb = skb;
3517 	}
3518 	return skb;
3519 }
3520 EXPORT_SYMBOL(napi_get_frags);
3521 
3522 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3523 			       gro_result_t ret)
3524 {
3525 	switch (ret) {
3526 	case GRO_NORMAL:
3527 	case GRO_HELD:
3528 		skb->protocol = eth_type_trans(skb, skb->dev);
3529 
3530 		if (ret == GRO_HELD)
3531 			skb_gro_pull(skb, -ETH_HLEN);
3532 		else if (netif_receive_skb(skb))
3533 			ret = GRO_DROP;
3534 		break;
3535 
3536 	case GRO_DROP:
3537 	case GRO_MERGED_FREE:
3538 		napi_reuse_skb(napi, skb);
3539 		break;
3540 
3541 	case GRO_MERGED:
3542 		break;
3543 	}
3544 
3545 	return ret;
3546 }
3547 EXPORT_SYMBOL(napi_frags_finish);
3548 
3549 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3550 {
3551 	struct sk_buff *skb = napi->skb;
3552 	struct ethhdr *eth;
3553 	unsigned int hlen;
3554 	unsigned int off;
3555 
3556 	napi->skb = NULL;
3557 
3558 	skb_reset_mac_header(skb);
3559 	skb_gro_reset_offset(skb);
3560 
3561 	off = skb_gro_offset(skb);
3562 	hlen = off + sizeof(*eth);
3563 	eth = skb_gro_header_fast(skb, off);
3564 	if (skb_gro_header_hard(skb, hlen)) {
3565 		eth = skb_gro_header_slow(skb, hlen, off);
3566 		if (unlikely(!eth)) {
3567 			napi_reuse_skb(napi, skb);
3568 			skb = NULL;
3569 			goto out;
3570 		}
3571 	}
3572 
3573 	skb_gro_pull(skb, sizeof(*eth));
3574 
3575 	/*
3576 	 * This works because the only protocols we care about don't require
3577 	 * special handling.  We'll fix it up properly at the end.
3578 	 */
3579 	skb->protocol = eth->h_proto;
3580 
3581 out:
3582 	return skb;
3583 }
3584 EXPORT_SYMBOL(napi_frags_skb);
3585 
3586 gro_result_t napi_gro_frags(struct napi_struct *napi)
3587 {
3588 	struct sk_buff *skb = napi_frags_skb(napi);
3589 
3590 	if (!skb)
3591 		return GRO_DROP;
3592 
3593 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3594 }
3595 EXPORT_SYMBOL(napi_gro_frags);
3596 
3597 /*
3598  * net_rps_action sends any pending IPI's for rps.
3599  * Note: called with local irq disabled, but exits with local irq enabled.
3600  */
3601 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3602 {
3603 #ifdef CONFIG_RPS
3604 	struct softnet_data *remsd = sd->rps_ipi_list;
3605 
3606 	if (remsd) {
3607 		sd->rps_ipi_list = NULL;
3608 
3609 		local_irq_enable();
3610 
3611 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3612 		while (remsd) {
3613 			struct softnet_data *next = remsd->rps_ipi_next;
3614 
3615 			if (cpu_online(remsd->cpu))
3616 				__smp_call_function_single(remsd->cpu,
3617 							   &remsd->csd, 0);
3618 			remsd = next;
3619 		}
3620 	} else
3621 #endif
3622 		local_irq_enable();
3623 }
3624 
3625 static int process_backlog(struct napi_struct *napi, int quota)
3626 {
3627 	int work = 0;
3628 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3629 
3630 #ifdef CONFIG_RPS
3631 	/* Check if we have pending ipi, its better to send them now,
3632 	 * not waiting net_rx_action() end.
3633 	 */
3634 	if (sd->rps_ipi_list) {
3635 		local_irq_disable();
3636 		net_rps_action_and_irq_enable(sd);
3637 	}
3638 #endif
3639 	napi->weight = weight_p;
3640 	local_irq_disable();
3641 	while (work < quota) {
3642 		struct sk_buff *skb;
3643 		unsigned int qlen;
3644 
3645 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3646 			local_irq_enable();
3647 			__netif_receive_skb(skb);
3648 			local_irq_disable();
3649 			input_queue_head_incr(sd);
3650 			if (++work >= quota) {
3651 				local_irq_enable();
3652 				return work;
3653 			}
3654 		}
3655 
3656 		rps_lock(sd);
3657 		qlen = skb_queue_len(&sd->input_pkt_queue);
3658 		if (qlen)
3659 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3660 						   &sd->process_queue);
3661 
3662 		if (qlen < quota - work) {
3663 			/*
3664 			 * Inline a custom version of __napi_complete().
3665 			 * only current cpu owns and manipulates this napi,
3666 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3667 			 * we can use a plain write instead of clear_bit(),
3668 			 * and we dont need an smp_mb() memory barrier.
3669 			 */
3670 			list_del(&napi->poll_list);
3671 			napi->state = 0;
3672 
3673 			quota = work + qlen;
3674 		}
3675 		rps_unlock(sd);
3676 	}
3677 	local_irq_enable();
3678 
3679 	return work;
3680 }
3681 
3682 /**
3683  * __napi_schedule - schedule for receive
3684  * @n: entry to schedule
3685  *
3686  * The entry's receive function will be scheduled to run
3687  */
3688 void __napi_schedule(struct napi_struct *n)
3689 {
3690 	unsigned long flags;
3691 
3692 	local_irq_save(flags);
3693 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3694 	local_irq_restore(flags);
3695 }
3696 EXPORT_SYMBOL(__napi_schedule);
3697 
3698 void __napi_complete(struct napi_struct *n)
3699 {
3700 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3701 	BUG_ON(n->gro_list);
3702 
3703 	list_del(&n->poll_list);
3704 	smp_mb__before_clear_bit();
3705 	clear_bit(NAPI_STATE_SCHED, &n->state);
3706 }
3707 EXPORT_SYMBOL(__napi_complete);
3708 
3709 void napi_complete(struct napi_struct *n)
3710 {
3711 	unsigned long flags;
3712 
3713 	/*
3714 	 * don't let napi dequeue from the cpu poll list
3715 	 * just in case its running on a different cpu
3716 	 */
3717 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3718 		return;
3719 
3720 	napi_gro_flush(n);
3721 	local_irq_save(flags);
3722 	__napi_complete(n);
3723 	local_irq_restore(flags);
3724 }
3725 EXPORT_SYMBOL(napi_complete);
3726 
3727 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3728 		    int (*poll)(struct napi_struct *, int), int weight)
3729 {
3730 	INIT_LIST_HEAD(&napi->poll_list);
3731 	napi->gro_count = 0;
3732 	napi->gro_list = NULL;
3733 	napi->skb = NULL;
3734 	napi->poll = poll;
3735 	napi->weight = weight;
3736 	list_add(&napi->dev_list, &dev->napi_list);
3737 	napi->dev = dev;
3738 #ifdef CONFIG_NETPOLL
3739 	spin_lock_init(&napi->poll_lock);
3740 	napi->poll_owner = -1;
3741 #endif
3742 	set_bit(NAPI_STATE_SCHED, &napi->state);
3743 }
3744 EXPORT_SYMBOL(netif_napi_add);
3745 
3746 void netif_napi_del(struct napi_struct *napi)
3747 {
3748 	struct sk_buff *skb, *next;
3749 
3750 	list_del_init(&napi->dev_list);
3751 	napi_free_frags(napi);
3752 
3753 	for (skb = napi->gro_list; skb; skb = next) {
3754 		next = skb->next;
3755 		skb->next = NULL;
3756 		kfree_skb(skb);
3757 	}
3758 
3759 	napi->gro_list = NULL;
3760 	napi->gro_count = 0;
3761 }
3762 EXPORT_SYMBOL(netif_napi_del);
3763 
3764 static void net_rx_action(struct softirq_action *h)
3765 {
3766 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3767 	unsigned long time_limit = jiffies + 2;
3768 	int budget = netdev_budget;
3769 	void *have;
3770 
3771 	local_irq_disable();
3772 
3773 	while (!list_empty(&sd->poll_list)) {
3774 		struct napi_struct *n;
3775 		int work, weight;
3776 
3777 		/* If softirq window is exhuasted then punt.
3778 		 * Allow this to run for 2 jiffies since which will allow
3779 		 * an average latency of 1.5/HZ.
3780 		 */
3781 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3782 			goto softnet_break;
3783 
3784 		local_irq_enable();
3785 
3786 		/* Even though interrupts have been re-enabled, this
3787 		 * access is safe because interrupts can only add new
3788 		 * entries to the tail of this list, and only ->poll()
3789 		 * calls can remove this head entry from the list.
3790 		 */
3791 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3792 
3793 		have = netpoll_poll_lock(n);
3794 
3795 		weight = n->weight;
3796 
3797 		/* This NAPI_STATE_SCHED test is for avoiding a race
3798 		 * with netpoll's poll_napi().  Only the entity which
3799 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3800 		 * actually make the ->poll() call.  Therefore we avoid
3801 		 * accidentally calling ->poll() when NAPI is not scheduled.
3802 		 */
3803 		work = 0;
3804 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3805 			work = n->poll(n, weight);
3806 			trace_napi_poll(n);
3807 		}
3808 
3809 		WARN_ON_ONCE(work > weight);
3810 
3811 		budget -= work;
3812 
3813 		local_irq_disable();
3814 
3815 		/* Drivers must not modify the NAPI state if they
3816 		 * consume the entire weight.  In such cases this code
3817 		 * still "owns" the NAPI instance and therefore can
3818 		 * move the instance around on the list at-will.
3819 		 */
3820 		if (unlikely(work == weight)) {
3821 			if (unlikely(napi_disable_pending(n))) {
3822 				local_irq_enable();
3823 				napi_complete(n);
3824 				local_irq_disable();
3825 			} else
3826 				list_move_tail(&n->poll_list, &sd->poll_list);
3827 		}
3828 
3829 		netpoll_poll_unlock(have);
3830 	}
3831 out:
3832 	net_rps_action_and_irq_enable(sd);
3833 
3834 #ifdef CONFIG_NET_DMA
3835 	/*
3836 	 * There may not be any more sk_buffs coming right now, so push
3837 	 * any pending DMA copies to hardware
3838 	 */
3839 	dma_issue_pending_all();
3840 #endif
3841 
3842 	return;
3843 
3844 softnet_break:
3845 	sd->time_squeeze++;
3846 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3847 	goto out;
3848 }
3849 
3850 static gifconf_func_t *gifconf_list[NPROTO];
3851 
3852 /**
3853  *	register_gifconf	-	register a SIOCGIF handler
3854  *	@family: Address family
3855  *	@gifconf: Function handler
3856  *
3857  *	Register protocol dependent address dumping routines. The handler
3858  *	that is passed must not be freed or reused until it has been replaced
3859  *	by another handler.
3860  */
3861 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3862 {
3863 	if (family >= NPROTO)
3864 		return -EINVAL;
3865 	gifconf_list[family] = gifconf;
3866 	return 0;
3867 }
3868 EXPORT_SYMBOL(register_gifconf);
3869 
3870 
3871 /*
3872  *	Map an interface index to its name (SIOCGIFNAME)
3873  */
3874 
3875 /*
3876  *	We need this ioctl for efficient implementation of the
3877  *	if_indextoname() function required by the IPv6 API.  Without
3878  *	it, we would have to search all the interfaces to find a
3879  *	match.  --pb
3880  */
3881 
3882 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3883 {
3884 	struct net_device *dev;
3885 	struct ifreq ifr;
3886 
3887 	/*
3888 	 *	Fetch the caller's info block.
3889 	 */
3890 
3891 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3892 		return -EFAULT;
3893 
3894 	rcu_read_lock();
3895 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3896 	if (!dev) {
3897 		rcu_read_unlock();
3898 		return -ENODEV;
3899 	}
3900 
3901 	strcpy(ifr.ifr_name, dev->name);
3902 	rcu_read_unlock();
3903 
3904 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3905 		return -EFAULT;
3906 	return 0;
3907 }
3908 
3909 /*
3910  *	Perform a SIOCGIFCONF call. This structure will change
3911  *	size eventually, and there is nothing I can do about it.
3912  *	Thus we will need a 'compatibility mode'.
3913  */
3914 
3915 static int dev_ifconf(struct net *net, char __user *arg)
3916 {
3917 	struct ifconf ifc;
3918 	struct net_device *dev;
3919 	char __user *pos;
3920 	int len;
3921 	int total;
3922 	int i;
3923 
3924 	/*
3925 	 *	Fetch the caller's info block.
3926 	 */
3927 
3928 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3929 		return -EFAULT;
3930 
3931 	pos = ifc.ifc_buf;
3932 	len = ifc.ifc_len;
3933 
3934 	/*
3935 	 *	Loop over the interfaces, and write an info block for each.
3936 	 */
3937 
3938 	total = 0;
3939 	for_each_netdev(net, dev) {
3940 		for (i = 0; i < NPROTO; i++) {
3941 			if (gifconf_list[i]) {
3942 				int done;
3943 				if (!pos)
3944 					done = gifconf_list[i](dev, NULL, 0);
3945 				else
3946 					done = gifconf_list[i](dev, pos + total,
3947 							       len - total);
3948 				if (done < 0)
3949 					return -EFAULT;
3950 				total += done;
3951 			}
3952 		}
3953 	}
3954 
3955 	/*
3956 	 *	All done.  Write the updated control block back to the caller.
3957 	 */
3958 	ifc.ifc_len = total;
3959 
3960 	/*
3961 	 * 	Both BSD and Solaris return 0 here, so we do too.
3962 	 */
3963 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3964 }
3965 
3966 #ifdef CONFIG_PROC_FS
3967 /*
3968  *	This is invoked by the /proc filesystem handler to display a device
3969  *	in detail.
3970  */
3971 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3972 	__acquires(RCU)
3973 {
3974 	struct net *net = seq_file_net(seq);
3975 	loff_t off;
3976 	struct net_device *dev;
3977 
3978 	rcu_read_lock();
3979 	if (!*pos)
3980 		return SEQ_START_TOKEN;
3981 
3982 	off = 1;
3983 	for_each_netdev_rcu(net, dev)
3984 		if (off++ == *pos)
3985 			return dev;
3986 
3987 	return NULL;
3988 }
3989 
3990 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3991 {
3992 	struct net_device *dev = v;
3993 
3994 	if (v == SEQ_START_TOKEN)
3995 		dev = first_net_device_rcu(seq_file_net(seq));
3996 	else
3997 		dev = next_net_device_rcu(dev);
3998 
3999 	++*pos;
4000 	return dev;
4001 }
4002 
4003 void dev_seq_stop(struct seq_file *seq, void *v)
4004 	__releases(RCU)
4005 {
4006 	rcu_read_unlock();
4007 }
4008 
4009 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4010 {
4011 	struct rtnl_link_stats64 temp;
4012 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4013 
4014 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4015 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4016 		   dev->name, stats->rx_bytes, stats->rx_packets,
4017 		   stats->rx_errors,
4018 		   stats->rx_dropped + stats->rx_missed_errors,
4019 		   stats->rx_fifo_errors,
4020 		   stats->rx_length_errors + stats->rx_over_errors +
4021 		    stats->rx_crc_errors + stats->rx_frame_errors,
4022 		   stats->rx_compressed, stats->multicast,
4023 		   stats->tx_bytes, stats->tx_packets,
4024 		   stats->tx_errors, stats->tx_dropped,
4025 		   stats->tx_fifo_errors, stats->collisions,
4026 		   stats->tx_carrier_errors +
4027 		    stats->tx_aborted_errors +
4028 		    stats->tx_window_errors +
4029 		    stats->tx_heartbeat_errors,
4030 		   stats->tx_compressed);
4031 }
4032 
4033 /*
4034  *	Called from the PROCfs module. This now uses the new arbitrary sized
4035  *	/proc/net interface to create /proc/net/dev
4036  */
4037 static int dev_seq_show(struct seq_file *seq, void *v)
4038 {
4039 	if (v == SEQ_START_TOKEN)
4040 		seq_puts(seq, "Inter-|   Receive                            "
4041 			      "                    |  Transmit\n"
4042 			      " face |bytes    packets errs drop fifo frame "
4043 			      "compressed multicast|bytes    packets errs "
4044 			      "drop fifo colls carrier compressed\n");
4045 	else
4046 		dev_seq_printf_stats(seq, v);
4047 	return 0;
4048 }
4049 
4050 static struct softnet_data *softnet_get_online(loff_t *pos)
4051 {
4052 	struct softnet_data *sd = NULL;
4053 
4054 	while (*pos < nr_cpu_ids)
4055 		if (cpu_online(*pos)) {
4056 			sd = &per_cpu(softnet_data, *pos);
4057 			break;
4058 		} else
4059 			++*pos;
4060 	return sd;
4061 }
4062 
4063 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4064 {
4065 	return softnet_get_online(pos);
4066 }
4067 
4068 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4069 {
4070 	++*pos;
4071 	return softnet_get_online(pos);
4072 }
4073 
4074 static void softnet_seq_stop(struct seq_file *seq, void *v)
4075 {
4076 }
4077 
4078 static int softnet_seq_show(struct seq_file *seq, void *v)
4079 {
4080 	struct softnet_data *sd = v;
4081 
4082 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4083 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4084 		   0, 0, 0, 0, /* was fastroute */
4085 		   sd->cpu_collision, sd->received_rps);
4086 	return 0;
4087 }
4088 
4089 static const struct seq_operations dev_seq_ops = {
4090 	.start = dev_seq_start,
4091 	.next  = dev_seq_next,
4092 	.stop  = dev_seq_stop,
4093 	.show  = dev_seq_show,
4094 };
4095 
4096 static int dev_seq_open(struct inode *inode, struct file *file)
4097 {
4098 	return seq_open_net(inode, file, &dev_seq_ops,
4099 			    sizeof(struct seq_net_private));
4100 }
4101 
4102 static const struct file_operations dev_seq_fops = {
4103 	.owner	 = THIS_MODULE,
4104 	.open    = dev_seq_open,
4105 	.read    = seq_read,
4106 	.llseek  = seq_lseek,
4107 	.release = seq_release_net,
4108 };
4109 
4110 static const struct seq_operations softnet_seq_ops = {
4111 	.start = softnet_seq_start,
4112 	.next  = softnet_seq_next,
4113 	.stop  = softnet_seq_stop,
4114 	.show  = softnet_seq_show,
4115 };
4116 
4117 static int softnet_seq_open(struct inode *inode, struct file *file)
4118 {
4119 	return seq_open(file, &softnet_seq_ops);
4120 }
4121 
4122 static const struct file_operations softnet_seq_fops = {
4123 	.owner	 = THIS_MODULE,
4124 	.open    = softnet_seq_open,
4125 	.read    = seq_read,
4126 	.llseek  = seq_lseek,
4127 	.release = seq_release,
4128 };
4129 
4130 static void *ptype_get_idx(loff_t pos)
4131 {
4132 	struct packet_type *pt = NULL;
4133 	loff_t i = 0;
4134 	int t;
4135 
4136 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4137 		if (i == pos)
4138 			return pt;
4139 		++i;
4140 	}
4141 
4142 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4143 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4144 			if (i == pos)
4145 				return pt;
4146 			++i;
4147 		}
4148 	}
4149 	return NULL;
4150 }
4151 
4152 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4153 	__acquires(RCU)
4154 {
4155 	rcu_read_lock();
4156 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4157 }
4158 
4159 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4160 {
4161 	struct packet_type *pt;
4162 	struct list_head *nxt;
4163 	int hash;
4164 
4165 	++*pos;
4166 	if (v == SEQ_START_TOKEN)
4167 		return ptype_get_idx(0);
4168 
4169 	pt = v;
4170 	nxt = pt->list.next;
4171 	if (pt->type == htons(ETH_P_ALL)) {
4172 		if (nxt != &ptype_all)
4173 			goto found;
4174 		hash = 0;
4175 		nxt = ptype_base[0].next;
4176 	} else
4177 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4178 
4179 	while (nxt == &ptype_base[hash]) {
4180 		if (++hash >= PTYPE_HASH_SIZE)
4181 			return NULL;
4182 		nxt = ptype_base[hash].next;
4183 	}
4184 found:
4185 	return list_entry(nxt, struct packet_type, list);
4186 }
4187 
4188 static void ptype_seq_stop(struct seq_file *seq, void *v)
4189 	__releases(RCU)
4190 {
4191 	rcu_read_unlock();
4192 }
4193 
4194 static int ptype_seq_show(struct seq_file *seq, void *v)
4195 {
4196 	struct packet_type *pt = v;
4197 
4198 	if (v == SEQ_START_TOKEN)
4199 		seq_puts(seq, "Type Device      Function\n");
4200 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4201 		if (pt->type == htons(ETH_P_ALL))
4202 			seq_puts(seq, "ALL ");
4203 		else
4204 			seq_printf(seq, "%04x", ntohs(pt->type));
4205 
4206 		seq_printf(seq, " %-8s %pF\n",
4207 			   pt->dev ? pt->dev->name : "", pt->func);
4208 	}
4209 
4210 	return 0;
4211 }
4212 
4213 static const struct seq_operations ptype_seq_ops = {
4214 	.start = ptype_seq_start,
4215 	.next  = ptype_seq_next,
4216 	.stop  = ptype_seq_stop,
4217 	.show  = ptype_seq_show,
4218 };
4219 
4220 static int ptype_seq_open(struct inode *inode, struct file *file)
4221 {
4222 	return seq_open_net(inode, file, &ptype_seq_ops,
4223 			sizeof(struct seq_net_private));
4224 }
4225 
4226 static const struct file_operations ptype_seq_fops = {
4227 	.owner	 = THIS_MODULE,
4228 	.open    = ptype_seq_open,
4229 	.read    = seq_read,
4230 	.llseek  = seq_lseek,
4231 	.release = seq_release_net,
4232 };
4233 
4234 
4235 static int __net_init dev_proc_net_init(struct net *net)
4236 {
4237 	int rc = -ENOMEM;
4238 
4239 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4240 		goto out;
4241 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4242 		goto out_dev;
4243 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4244 		goto out_softnet;
4245 
4246 	if (wext_proc_init(net))
4247 		goto out_ptype;
4248 	rc = 0;
4249 out:
4250 	return rc;
4251 out_ptype:
4252 	proc_net_remove(net, "ptype");
4253 out_softnet:
4254 	proc_net_remove(net, "softnet_stat");
4255 out_dev:
4256 	proc_net_remove(net, "dev");
4257 	goto out;
4258 }
4259 
4260 static void __net_exit dev_proc_net_exit(struct net *net)
4261 {
4262 	wext_proc_exit(net);
4263 
4264 	proc_net_remove(net, "ptype");
4265 	proc_net_remove(net, "softnet_stat");
4266 	proc_net_remove(net, "dev");
4267 }
4268 
4269 static struct pernet_operations __net_initdata dev_proc_ops = {
4270 	.init = dev_proc_net_init,
4271 	.exit = dev_proc_net_exit,
4272 };
4273 
4274 static int __init dev_proc_init(void)
4275 {
4276 	return register_pernet_subsys(&dev_proc_ops);
4277 }
4278 #else
4279 #define dev_proc_init() 0
4280 #endif	/* CONFIG_PROC_FS */
4281 
4282 
4283 /**
4284  *	netdev_set_master	-	set up master pointer
4285  *	@slave: slave device
4286  *	@master: new master device
4287  *
4288  *	Changes the master device of the slave. Pass %NULL to break the
4289  *	bonding. The caller must hold the RTNL semaphore. On a failure
4290  *	a negative errno code is returned. On success the reference counts
4291  *	are adjusted and the function returns zero.
4292  */
4293 int netdev_set_master(struct net_device *slave, struct net_device *master)
4294 {
4295 	struct net_device *old = slave->master;
4296 
4297 	ASSERT_RTNL();
4298 
4299 	if (master) {
4300 		if (old)
4301 			return -EBUSY;
4302 		dev_hold(master);
4303 	}
4304 
4305 	slave->master = master;
4306 
4307 	if (old)
4308 		dev_put(old);
4309 	return 0;
4310 }
4311 EXPORT_SYMBOL(netdev_set_master);
4312 
4313 /**
4314  *	netdev_set_bond_master	-	set up bonding master/slave pair
4315  *	@slave: slave device
4316  *	@master: new master device
4317  *
4318  *	Changes the master device of the slave. Pass %NULL to break the
4319  *	bonding. The caller must hold the RTNL semaphore. On a failure
4320  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4321  *	to the routing socket and the function returns zero.
4322  */
4323 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4324 {
4325 	int err;
4326 
4327 	ASSERT_RTNL();
4328 
4329 	err = netdev_set_master(slave, master);
4330 	if (err)
4331 		return err;
4332 	if (master)
4333 		slave->flags |= IFF_SLAVE;
4334 	else
4335 		slave->flags &= ~IFF_SLAVE;
4336 
4337 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4338 	return 0;
4339 }
4340 EXPORT_SYMBOL(netdev_set_bond_master);
4341 
4342 static void dev_change_rx_flags(struct net_device *dev, int flags)
4343 {
4344 	const struct net_device_ops *ops = dev->netdev_ops;
4345 
4346 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4347 		ops->ndo_change_rx_flags(dev, flags);
4348 }
4349 
4350 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4351 {
4352 	unsigned short old_flags = dev->flags;
4353 	uid_t uid;
4354 	gid_t gid;
4355 
4356 	ASSERT_RTNL();
4357 
4358 	dev->flags |= IFF_PROMISC;
4359 	dev->promiscuity += inc;
4360 	if (dev->promiscuity == 0) {
4361 		/*
4362 		 * Avoid overflow.
4363 		 * If inc causes overflow, untouch promisc and return error.
4364 		 */
4365 		if (inc < 0)
4366 			dev->flags &= ~IFF_PROMISC;
4367 		else {
4368 			dev->promiscuity -= inc;
4369 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4370 				"set promiscuity failed, promiscuity feature "
4371 				"of device might be broken.\n", dev->name);
4372 			return -EOVERFLOW;
4373 		}
4374 	}
4375 	if (dev->flags != old_flags) {
4376 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4377 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4378 							       "left");
4379 		if (audit_enabled) {
4380 			current_uid_gid(&uid, &gid);
4381 			audit_log(current->audit_context, GFP_ATOMIC,
4382 				AUDIT_ANOM_PROMISCUOUS,
4383 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4384 				dev->name, (dev->flags & IFF_PROMISC),
4385 				(old_flags & IFF_PROMISC),
4386 				audit_get_loginuid(current),
4387 				uid, gid,
4388 				audit_get_sessionid(current));
4389 		}
4390 
4391 		dev_change_rx_flags(dev, IFF_PROMISC);
4392 	}
4393 	return 0;
4394 }
4395 
4396 /**
4397  *	dev_set_promiscuity	- update promiscuity count on a device
4398  *	@dev: device
4399  *	@inc: modifier
4400  *
4401  *	Add or remove promiscuity from a device. While the count in the device
4402  *	remains above zero the interface remains promiscuous. Once it hits zero
4403  *	the device reverts back to normal filtering operation. A negative inc
4404  *	value is used to drop promiscuity on the device.
4405  *	Return 0 if successful or a negative errno code on error.
4406  */
4407 int dev_set_promiscuity(struct net_device *dev, int inc)
4408 {
4409 	unsigned short old_flags = dev->flags;
4410 	int err;
4411 
4412 	err = __dev_set_promiscuity(dev, inc);
4413 	if (err < 0)
4414 		return err;
4415 	if (dev->flags != old_flags)
4416 		dev_set_rx_mode(dev);
4417 	return err;
4418 }
4419 EXPORT_SYMBOL(dev_set_promiscuity);
4420 
4421 /**
4422  *	dev_set_allmulti	- update allmulti count on a device
4423  *	@dev: device
4424  *	@inc: modifier
4425  *
4426  *	Add or remove reception of all multicast frames to a device. While the
4427  *	count in the device remains above zero the interface remains listening
4428  *	to all interfaces. Once it hits zero the device reverts back to normal
4429  *	filtering operation. A negative @inc value is used to drop the counter
4430  *	when releasing a resource needing all multicasts.
4431  *	Return 0 if successful or a negative errno code on error.
4432  */
4433 
4434 int dev_set_allmulti(struct net_device *dev, int inc)
4435 {
4436 	unsigned short old_flags = dev->flags;
4437 
4438 	ASSERT_RTNL();
4439 
4440 	dev->flags |= IFF_ALLMULTI;
4441 	dev->allmulti += inc;
4442 	if (dev->allmulti == 0) {
4443 		/*
4444 		 * Avoid overflow.
4445 		 * If inc causes overflow, untouch allmulti and return error.
4446 		 */
4447 		if (inc < 0)
4448 			dev->flags &= ~IFF_ALLMULTI;
4449 		else {
4450 			dev->allmulti -= inc;
4451 			printk(KERN_WARNING "%s: allmulti touches roof, "
4452 				"set allmulti failed, allmulti feature of "
4453 				"device might be broken.\n", dev->name);
4454 			return -EOVERFLOW;
4455 		}
4456 	}
4457 	if (dev->flags ^ old_flags) {
4458 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4459 		dev_set_rx_mode(dev);
4460 	}
4461 	return 0;
4462 }
4463 EXPORT_SYMBOL(dev_set_allmulti);
4464 
4465 /*
4466  *	Upload unicast and multicast address lists to device and
4467  *	configure RX filtering. When the device doesn't support unicast
4468  *	filtering it is put in promiscuous mode while unicast addresses
4469  *	are present.
4470  */
4471 void __dev_set_rx_mode(struct net_device *dev)
4472 {
4473 	const struct net_device_ops *ops = dev->netdev_ops;
4474 
4475 	/* dev_open will call this function so the list will stay sane. */
4476 	if (!(dev->flags&IFF_UP))
4477 		return;
4478 
4479 	if (!netif_device_present(dev))
4480 		return;
4481 
4482 	if (ops->ndo_set_rx_mode)
4483 		ops->ndo_set_rx_mode(dev);
4484 	else {
4485 		/* Unicast addresses changes may only happen under the rtnl,
4486 		 * therefore calling __dev_set_promiscuity here is safe.
4487 		 */
4488 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4489 			__dev_set_promiscuity(dev, 1);
4490 			dev->uc_promisc = 1;
4491 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4492 			__dev_set_promiscuity(dev, -1);
4493 			dev->uc_promisc = 0;
4494 		}
4495 
4496 		if (ops->ndo_set_multicast_list)
4497 			ops->ndo_set_multicast_list(dev);
4498 	}
4499 }
4500 
4501 void dev_set_rx_mode(struct net_device *dev)
4502 {
4503 	netif_addr_lock_bh(dev);
4504 	__dev_set_rx_mode(dev);
4505 	netif_addr_unlock_bh(dev);
4506 }
4507 
4508 /**
4509  *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4510  *	@dev: device
4511  *	@cmd: memory area for ethtool_ops::get_settings() result
4512  *
4513  *      The cmd arg is initialized properly (cleared and
4514  *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4515  *
4516  *	Return device's ethtool_ops::get_settings() result value or
4517  *	-EOPNOTSUPP when device doesn't expose
4518  *	ethtool_ops::get_settings() operation.
4519  */
4520 int dev_ethtool_get_settings(struct net_device *dev,
4521 			     struct ethtool_cmd *cmd)
4522 {
4523 	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4524 		return -EOPNOTSUPP;
4525 
4526 	memset(cmd, 0, sizeof(struct ethtool_cmd));
4527 	cmd->cmd = ETHTOOL_GSET;
4528 	return dev->ethtool_ops->get_settings(dev, cmd);
4529 }
4530 EXPORT_SYMBOL(dev_ethtool_get_settings);
4531 
4532 /**
4533  *	dev_get_flags - get flags reported to userspace
4534  *	@dev: device
4535  *
4536  *	Get the combination of flag bits exported through APIs to userspace.
4537  */
4538 unsigned dev_get_flags(const struct net_device *dev)
4539 {
4540 	unsigned flags;
4541 
4542 	flags = (dev->flags & ~(IFF_PROMISC |
4543 				IFF_ALLMULTI |
4544 				IFF_RUNNING |
4545 				IFF_LOWER_UP |
4546 				IFF_DORMANT)) |
4547 		(dev->gflags & (IFF_PROMISC |
4548 				IFF_ALLMULTI));
4549 
4550 	if (netif_running(dev)) {
4551 		if (netif_oper_up(dev))
4552 			flags |= IFF_RUNNING;
4553 		if (netif_carrier_ok(dev))
4554 			flags |= IFF_LOWER_UP;
4555 		if (netif_dormant(dev))
4556 			flags |= IFF_DORMANT;
4557 	}
4558 
4559 	return flags;
4560 }
4561 EXPORT_SYMBOL(dev_get_flags);
4562 
4563 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4564 {
4565 	int old_flags = dev->flags;
4566 	int ret;
4567 
4568 	ASSERT_RTNL();
4569 
4570 	/*
4571 	 *	Set the flags on our device.
4572 	 */
4573 
4574 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4575 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4576 			       IFF_AUTOMEDIA)) |
4577 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4578 				    IFF_ALLMULTI));
4579 
4580 	/*
4581 	 *	Load in the correct multicast list now the flags have changed.
4582 	 */
4583 
4584 	if ((old_flags ^ flags) & IFF_MULTICAST)
4585 		dev_change_rx_flags(dev, IFF_MULTICAST);
4586 
4587 	dev_set_rx_mode(dev);
4588 
4589 	/*
4590 	 *	Have we downed the interface. We handle IFF_UP ourselves
4591 	 *	according to user attempts to set it, rather than blindly
4592 	 *	setting it.
4593 	 */
4594 
4595 	ret = 0;
4596 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4597 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4598 
4599 		if (!ret)
4600 			dev_set_rx_mode(dev);
4601 	}
4602 
4603 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4604 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4605 
4606 		dev->gflags ^= IFF_PROMISC;
4607 		dev_set_promiscuity(dev, inc);
4608 	}
4609 
4610 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4611 	   is important. Some (broken) drivers set IFF_PROMISC, when
4612 	   IFF_ALLMULTI is requested not asking us and not reporting.
4613 	 */
4614 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4615 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4616 
4617 		dev->gflags ^= IFF_ALLMULTI;
4618 		dev_set_allmulti(dev, inc);
4619 	}
4620 
4621 	return ret;
4622 }
4623 
4624 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4625 {
4626 	unsigned int changes = dev->flags ^ old_flags;
4627 
4628 	if (changes & IFF_UP) {
4629 		if (dev->flags & IFF_UP)
4630 			call_netdevice_notifiers(NETDEV_UP, dev);
4631 		else
4632 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4633 	}
4634 
4635 	if (dev->flags & IFF_UP &&
4636 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4637 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4638 }
4639 
4640 /**
4641  *	dev_change_flags - change device settings
4642  *	@dev: device
4643  *	@flags: device state flags
4644  *
4645  *	Change settings on device based state flags. The flags are
4646  *	in the userspace exported format.
4647  */
4648 int dev_change_flags(struct net_device *dev, unsigned flags)
4649 {
4650 	int ret, changes;
4651 	int old_flags = dev->flags;
4652 
4653 	ret = __dev_change_flags(dev, flags);
4654 	if (ret < 0)
4655 		return ret;
4656 
4657 	changes = old_flags ^ dev->flags;
4658 	if (changes)
4659 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4660 
4661 	__dev_notify_flags(dev, old_flags);
4662 	return ret;
4663 }
4664 EXPORT_SYMBOL(dev_change_flags);
4665 
4666 /**
4667  *	dev_set_mtu - Change maximum transfer unit
4668  *	@dev: device
4669  *	@new_mtu: new transfer unit
4670  *
4671  *	Change the maximum transfer size of the network device.
4672  */
4673 int dev_set_mtu(struct net_device *dev, int new_mtu)
4674 {
4675 	const struct net_device_ops *ops = dev->netdev_ops;
4676 	int err;
4677 
4678 	if (new_mtu == dev->mtu)
4679 		return 0;
4680 
4681 	/*	MTU must be positive.	 */
4682 	if (new_mtu < 0)
4683 		return -EINVAL;
4684 
4685 	if (!netif_device_present(dev))
4686 		return -ENODEV;
4687 
4688 	err = 0;
4689 	if (ops->ndo_change_mtu)
4690 		err = ops->ndo_change_mtu(dev, new_mtu);
4691 	else
4692 		dev->mtu = new_mtu;
4693 
4694 	if (!err && dev->flags & IFF_UP)
4695 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4696 	return err;
4697 }
4698 EXPORT_SYMBOL(dev_set_mtu);
4699 
4700 /**
4701  *	dev_set_group - Change group this device belongs to
4702  *	@dev: device
4703  *	@new_group: group this device should belong to
4704  */
4705 void dev_set_group(struct net_device *dev, int new_group)
4706 {
4707 	dev->group = new_group;
4708 }
4709 EXPORT_SYMBOL(dev_set_group);
4710 
4711 /**
4712  *	dev_set_mac_address - Change Media Access Control Address
4713  *	@dev: device
4714  *	@sa: new address
4715  *
4716  *	Change the hardware (MAC) address of the device
4717  */
4718 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4719 {
4720 	const struct net_device_ops *ops = dev->netdev_ops;
4721 	int err;
4722 
4723 	if (!ops->ndo_set_mac_address)
4724 		return -EOPNOTSUPP;
4725 	if (sa->sa_family != dev->type)
4726 		return -EINVAL;
4727 	if (!netif_device_present(dev))
4728 		return -ENODEV;
4729 	err = ops->ndo_set_mac_address(dev, sa);
4730 	if (!err)
4731 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4732 	return err;
4733 }
4734 EXPORT_SYMBOL(dev_set_mac_address);
4735 
4736 /*
4737  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4738  */
4739 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4740 {
4741 	int err;
4742 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4743 
4744 	if (!dev)
4745 		return -ENODEV;
4746 
4747 	switch (cmd) {
4748 	case SIOCGIFFLAGS:	/* Get interface flags */
4749 		ifr->ifr_flags = (short) dev_get_flags(dev);
4750 		return 0;
4751 
4752 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4753 				   (currently unused) */
4754 		ifr->ifr_metric = 0;
4755 		return 0;
4756 
4757 	case SIOCGIFMTU:	/* Get the MTU of a device */
4758 		ifr->ifr_mtu = dev->mtu;
4759 		return 0;
4760 
4761 	case SIOCGIFHWADDR:
4762 		if (!dev->addr_len)
4763 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4764 		else
4765 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4766 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4767 		ifr->ifr_hwaddr.sa_family = dev->type;
4768 		return 0;
4769 
4770 	case SIOCGIFSLAVE:
4771 		err = -EINVAL;
4772 		break;
4773 
4774 	case SIOCGIFMAP:
4775 		ifr->ifr_map.mem_start = dev->mem_start;
4776 		ifr->ifr_map.mem_end   = dev->mem_end;
4777 		ifr->ifr_map.base_addr = dev->base_addr;
4778 		ifr->ifr_map.irq       = dev->irq;
4779 		ifr->ifr_map.dma       = dev->dma;
4780 		ifr->ifr_map.port      = dev->if_port;
4781 		return 0;
4782 
4783 	case SIOCGIFINDEX:
4784 		ifr->ifr_ifindex = dev->ifindex;
4785 		return 0;
4786 
4787 	case SIOCGIFTXQLEN:
4788 		ifr->ifr_qlen = dev->tx_queue_len;
4789 		return 0;
4790 
4791 	default:
4792 		/* dev_ioctl() should ensure this case
4793 		 * is never reached
4794 		 */
4795 		WARN_ON(1);
4796 		err = -ENOTTY;
4797 		break;
4798 
4799 	}
4800 	return err;
4801 }
4802 
4803 /*
4804  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4805  */
4806 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4807 {
4808 	int err;
4809 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4810 	const struct net_device_ops *ops;
4811 
4812 	if (!dev)
4813 		return -ENODEV;
4814 
4815 	ops = dev->netdev_ops;
4816 
4817 	switch (cmd) {
4818 	case SIOCSIFFLAGS:	/* Set interface flags */
4819 		return dev_change_flags(dev, ifr->ifr_flags);
4820 
4821 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4822 				   (currently unused) */
4823 		return -EOPNOTSUPP;
4824 
4825 	case SIOCSIFMTU:	/* Set the MTU of a device */
4826 		return dev_set_mtu(dev, ifr->ifr_mtu);
4827 
4828 	case SIOCSIFHWADDR:
4829 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4830 
4831 	case SIOCSIFHWBROADCAST:
4832 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4833 			return -EINVAL;
4834 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4835 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4836 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4837 		return 0;
4838 
4839 	case SIOCSIFMAP:
4840 		if (ops->ndo_set_config) {
4841 			if (!netif_device_present(dev))
4842 				return -ENODEV;
4843 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4844 		}
4845 		return -EOPNOTSUPP;
4846 
4847 	case SIOCADDMULTI:
4848 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4849 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4850 			return -EINVAL;
4851 		if (!netif_device_present(dev))
4852 			return -ENODEV;
4853 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4854 
4855 	case SIOCDELMULTI:
4856 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4857 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4858 			return -EINVAL;
4859 		if (!netif_device_present(dev))
4860 			return -ENODEV;
4861 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4862 
4863 	case SIOCSIFTXQLEN:
4864 		if (ifr->ifr_qlen < 0)
4865 			return -EINVAL;
4866 		dev->tx_queue_len = ifr->ifr_qlen;
4867 		return 0;
4868 
4869 	case SIOCSIFNAME:
4870 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4871 		return dev_change_name(dev, ifr->ifr_newname);
4872 
4873 	/*
4874 	 *	Unknown or private ioctl
4875 	 */
4876 	default:
4877 		if ((cmd >= SIOCDEVPRIVATE &&
4878 		    cmd <= SIOCDEVPRIVATE + 15) ||
4879 		    cmd == SIOCBONDENSLAVE ||
4880 		    cmd == SIOCBONDRELEASE ||
4881 		    cmd == SIOCBONDSETHWADDR ||
4882 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4883 		    cmd == SIOCBONDINFOQUERY ||
4884 		    cmd == SIOCBONDCHANGEACTIVE ||
4885 		    cmd == SIOCGMIIPHY ||
4886 		    cmd == SIOCGMIIREG ||
4887 		    cmd == SIOCSMIIREG ||
4888 		    cmd == SIOCBRADDIF ||
4889 		    cmd == SIOCBRDELIF ||
4890 		    cmd == SIOCSHWTSTAMP ||
4891 		    cmd == SIOCWANDEV) {
4892 			err = -EOPNOTSUPP;
4893 			if (ops->ndo_do_ioctl) {
4894 				if (netif_device_present(dev))
4895 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4896 				else
4897 					err = -ENODEV;
4898 			}
4899 		} else
4900 			err = -EINVAL;
4901 
4902 	}
4903 	return err;
4904 }
4905 
4906 /*
4907  *	This function handles all "interface"-type I/O control requests. The actual
4908  *	'doing' part of this is dev_ifsioc above.
4909  */
4910 
4911 /**
4912  *	dev_ioctl	-	network device ioctl
4913  *	@net: the applicable net namespace
4914  *	@cmd: command to issue
4915  *	@arg: pointer to a struct ifreq in user space
4916  *
4917  *	Issue ioctl functions to devices. This is normally called by the
4918  *	user space syscall interfaces but can sometimes be useful for
4919  *	other purposes. The return value is the return from the syscall if
4920  *	positive or a negative errno code on error.
4921  */
4922 
4923 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4924 {
4925 	struct ifreq ifr;
4926 	int ret;
4927 	char *colon;
4928 
4929 	/* One special case: SIOCGIFCONF takes ifconf argument
4930 	   and requires shared lock, because it sleeps writing
4931 	   to user space.
4932 	 */
4933 
4934 	if (cmd == SIOCGIFCONF) {
4935 		rtnl_lock();
4936 		ret = dev_ifconf(net, (char __user *) arg);
4937 		rtnl_unlock();
4938 		return ret;
4939 	}
4940 	if (cmd == SIOCGIFNAME)
4941 		return dev_ifname(net, (struct ifreq __user *)arg);
4942 
4943 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4944 		return -EFAULT;
4945 
4946 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4947 
4948 	colon = strchr(ifr.ifr_name, ':');
4949 	if (colon)
4950 		*colon = 0;
4951 
4952 	/*
4953 	 *	See which interface the caller is talking about.
4954 	 */
4955 
4956 	switch (cmd) {
4957 	/*
4958 	 *	These ioctl calls:
4959 	 *	- can be done by all.
4960 	 *	- atomic and do not require locking.
4961 	 *	- return a value
4962 	 */
4963 	case SIOCGIFFLAGS:
4964 	case SIOCGIFMETRIC:
4965 	case SIOCGIFMTU:
4966 	case SIOCGIFHWADDR:
4967 	case SIOCGIFSLAVE:
4968 	case SIOCGIFMAP:
4969 	case SIOCGIFINDEX:
4970 	case SIOCGIFTXQLEN:
4971 		dev_load(net, ifr.ifr_name);
4972 		rcu_read_lock();
4973 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4974 		rcu_read_unlock();
4975 		if (!ret) {
4976 			if (colon)
4977 				*colon = ':';
4978 			if (copy_to_user(arg, &ifr,
4979 					 sizeof(struct ifreq)))
4980 				ret = -EFAULT;
4981 		}
4982 		return ret;
4983 
4984 	case SIOCETHTOOL:
4985 		dev_load(net, ifr.ifr_name);
4986 		rtnl_lock();
4987 		ret = dev_ethtool(net, &ifr);
4988 		rtnl_unlock();
4989 		if (!ret) {
4990 			if (colon)
4991 				*colon = ':';
4992 			if (copy_to_user(arg, &ifr,
4993 					 sizeof(struct ifreq)))
4994 				ret = -EFAULT;
4995 		}
4996 		return ret;
4997 
4998 	/*
4999 	 *	These ioctl calls:
5000 	 *	- require superuser power.
5001 	 *	- require strict serialization.
5002 	 *	- return a value
5003 	 */
5004 	case SIOCGMIIPHY:
5005 	case SIOCGMIIREG:
5006 	case SIOCSIFNAME:
5007 		if (!capable(CAP_NET_ADMIN))
5008 			return -EPERM;
5009 		dev_load(net, ifr.ifr_name);
5010 		rtnl_lock();
5011 		ret = dev_ifsioc(net, &ifr, cmd);
5012 		rtnl_unlock();
5013 		if (!ret) {
5014 			if (colon)
5015 				*colon = ':';
5016 			if (copy_to_user(arg, &ifr,
5017 					 sizeof(struct ifreq)))
5018 				ret = -EFAULT;
5019 		}
5020 		return ret;
5021 
5022 	/*
5023 	 *	These ioctl calls:
5024 	 *	- require superuser power.
5025 	 *	- require strict serialization.
5026 	 *	- do not return a value
5027 	 */
5028 	case SIOCSIFFLAGS:
5029 	case SIOCSIFMETRIC:
5030 	case SIOCSIFMTU:
5031 	case SIOCSIFMAP:
5032 	case SIOCSIFHWADDR:
5033 	case SIOCSIFSLAVE:
5034 	case SIOCADDMULTI:
5035 	case SIOCDELMULTI:
5036 	case SIOCSIFHWBROADCAST:
5037 	case SIOCSIFTXQLEN:
5038 	case SIOCSMIIREG:
5039 	case SIOCBONDENSLAVE:
5040 	case SIOCBONDRELEASE:
5041 	case SIOCBONDSETHWADDR:
5042 	case SIOCBONDCHANGEACTIVE:
5043 	case SIOCBRADDIF:
5044 	case SIOCBRDELIF:
5045 	case SIOCSHWTSTAMP:
5046 		if (!capable(CAP_NET_ADMIN))
5047 			return -EPERM;
5048 		/* fall through */
5049 	case SIOCBONDSLAVEINFOQUERY:
5050 	case SIOCBONDINFOQUERY:
5051 		dev_load(net, ifr.ifr_name);
5052 		rtnl_lock();
5053 		ret = dev_ifsioc(net, &ifr, cmd);
5054 		rtnl_unlock();
5055 		return ret;
5056 
5057 	case SIOCGIFMEM:
5058 		/* Get the per device memory space. We can add this but
5059 		 * currently do not support it */
5060 	case SIOCSIFMEM:
5061 		/* Set the per device memory buffer space.
5062 		 * Not applicable in our case */
5063 	case SIOCSIFLINK:
5064 		return -ENOTTY;
5065 
5066 	/*
5067 	 *	Unknown or private ioctl.
5068 	 */
5069 	default:
5070 		if (cmd == SIOCWANDEV ||
5071 		    (cmd >= SIOCDEVPRIVATE &&
5072 		     cmd <= SIOCDEVPRIVATE + 15)) {
5073 			dev_load(net, ifr.ifr_name);
5074 			rtnl_lock();
5075 			ret = dev_ifsioc(net, &ifr, cmd);
5076 			rtnl_unlock();
5077 			if (!ret && copy_to_user(arg, &ifr,
5078 						 sizeof(struct ifreq)))
5079 				ret = -EFAULT;
5080 			return ret;
5081 		}
5082 		/* Take care of Wireless Extensions */
5083 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5084 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5085 		return -ENOTTY;
5086 	}
5087 }
5088 
5089 
5090 /**
5091  *	dev_new_index	-	allocate an ifindex
5092  *	@net: the applicable net namespace
5093  *
5094  *	Returns a suitable unique value for a new device interface
5095  *	number.  The caller must hold the rtnl semaphore or the
5096  *	dev_base_lock to be sure it remains unique.
5097  */
5098 static int dev_new_index(struct net *net)
5099 {
5100 	static int ifindex;
5101 	for (;;) {
5102 		if (++ifindex <= 0)
5103 			ifindex = 1;
5104 		if (!__dev_get_by_index(net, ifindex))
5105 			return ifindex;
5106 	}
5107 }
5108 
5109 /* Delayed registration/unregisteration */
5110 static LIST_HEAD(net_todo_list);
5111 
5112 static void net_set_todo(struct net_device *dev)
5113 {
5114 	list_add_tail(&dev->todo_list, &net_todo_list);
5115 }
5116 
5117 static void rollback_registered_many(struct list_head *head)
5118 {
5119 	struct net_device *dev, *tmp;
5120 
5121 	BUG_ON(dev_boot_phase);
5122 	ASSERT_RTNL();
5123 
5124 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5125 		/* Some devices call without registering
5126 		 * for initialization unwind. Remove those
5127 		 * devices and proceed with the remaining.
5128 		 */
5129 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5130 			pr_debug("unregister_netdevice: device %s/%p never "
5131 				 "was registered\n", dev->name, dev);
5132 
5133 			WARN_ON(1);
5134 			list_del(&dev->unreg_list);
5135 			continue;
5136 		}
5137 		dev->dismantle = true;
5138 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5139 	}
5140 
5141 	/* If device is running, close it first. */
5142 	dev_close_many(head);
5143 
5144 	list_for_each_entry(dev, head, unreg_list) {
5145 		/* And unlink it from device chain. */
5146 		unlist_netdevice(dev);
5147 
5148 		dev->reg_state = NETREG_UNREGISTERING;
5149 	}
5150 
5151 	synchronize_net();
5152 
5153 	list_for_each_entry(dev, head, unreg_list) {
5154 		/* Shutdown queueing discipline. */
5155 		dev_shutdown(dev);
5156 
5157 
5158 		/* Notify protocols, that we are about to destroy
5159 		   this device. They should clean all the things.
5160 		*/
5161 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5162 
5163 		if (!dev->rtnl_link_ops ||
5164 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5165 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5166 
5167 		/*
5168 		 *	Flush the unicast and multicast chains
5169 		 */
5170 		dev_uc_flush(dev);
5171 		dev_mc_flush(dev);
5172 
5173 		if (dev->netdev_ops->ndo_uninit)
5174 			dev->netdev_ops->ndo_uninit(dev);
5175 
5176 		/* Notifier chain MUST detach us from master device. */
5177 		WARN_ON(dev->master);
5178 
5179 		/* Remove entries from kobject tree */
5180 		netdev_unregister_kobject(dev);
5181 	}
5182 
5183 	/* Process any work delayed until the end of the batch */
5184 	dev = list_first_entry(head, struct net_device, unreg_list);
5185 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5186 
5187 	rcu_barrier();
5188 
5189 	list_for_each_entry(dev, head, unreg_list)
5190 		dev_put(dev);
5191 }
5192 
5193 static void rollback_registered(struct net_device *dev)
5194 {
5195 	LIST_HEAD(single);
5196 
5197 	list_add(&dev->unreg_list, &single);
5198 	rollback_registered_many(&single);
5199 	list_del(&single);
5200 }
5201 
5202 u32 netdev_fix_features(struct net_device *dev, u32 features)
5203 {
5204 	/* Fix illegal checksum combinations */
5205 	if ((features & NETIF_F_HW_CSUM) &&
5206 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5207 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5208 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5209 	}
5210 
5211 	if ((features & NETIF_F_NO_CSUM) &&
5212 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5213 		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5214 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5215 	}
5216 
5217 	/* Fix illegal SG+CSUM combinations. */
5218 	if ((features & NETIF_F_SG) &&
5219 	    !(features & NETIF_F_ALL_CSUM)) {
5220 		netdev_dbg(dev,
5221 			"Dropping NETIF_F_SG since no checksum feature.\n");
5222 		features &= ~NETIF_F_SG;
5223 	}
5224 
5225 	/* TSO requires that SG is present as well. */
5226 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5227 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5228 		features &= ~NETIF_F_ALL_TSO;
5229 	}
5230 
5231 	/* TSO ECN requires that TSO is present as well. */
5232 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5233 		features &= ~NETIF_F_TSO_ECN;
5234 
5235 	/* Software GSO depends on SG. */
5236 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5237 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5238 		features &= ~NETIF_F_GSO;
5239 	}
5240 
5241 	/* UFO needs SG and checksumming */
5242 	if (features & NETIF_F_UFO) {
5243 		/* maybe split UFO into V4 and V6? */
5244 		if (!((features & NETIF_F_GEN_CSUM) ||
5245 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5246 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5247 			netdev_dbg(dev,
5248 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5249 			features &= ~NETIF_F_UFO;
5250 		}
5251 
5252 		if (!(features & NETIF_F_SG)) {
5253 			netdev_dbg(dev,
5254 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5255 			features &= ~NETIF_F_UFO;
5256 		}
5257 	}
5258 
5259 	return features;
5260 }
5261 EXPORT_SYMBOL(netdev_fix_features);
5262 
5263 int __netdev_update_features(struct net_device *dev)
5264 {
5265 	u32 features;
5266 	int err = 0;
5267 
5268 	ASSERT_RTNL();
5269 
5270 	features = netdev_get_wanted_features(dev);
5271 
5272 	if (dev->netdev_ops->ndo_fix_features)
5273 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5274 
5275 	/* driver might be less strict about feature dependencies */
5276 	features = netdev_fix_features(dev, features);
5277 
5278 	if (dev->features == features)
5279 		return 0;
5280 
5281 	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5282 		dev->features, features);
5283 
5284 	if (dev->netdev_ops->ndo_set_features)
5285 		err = dev->netdev_ops->ndo_set_features(dev, features);
5286 
5287 	if (unlikely(err < 0)) {
5288 		netdev_err(dev,
5289 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5290 			err, features, dev->features);
5291 		return -1;
5292 	}
5293 
5294 	if (!err)
5295 		dev->features = features;
5296 
5297 	return 1;
5298 }
5299 
5300 /**
5301  *	netdev_update_features - recalculate device features
5302  *	@dev: the device to check
5303  *
5304  *	Recalculate dev->features set and send notifications if it
5305  *	has changed. Should be called after driver or hardware dependent
5306  *	conditions might have changed that influence the features.
5307  */
5308 void netdev_update_features(struct net_device *dev)
5309 {
5310 	if (__netdev_update_features(dev))
5311 		netdev_features_change(dev);
5312 }
5313 EXPORT_SYMBOL(netdev_update_features);
5314 
5315 /**
5316  *	netdev_change_features - recalculate device features
5317  *	@dev: the device to check
5318  *
5319  *	Recalculate dev->features set and send notifications even
5320  *	if they have not changed. Should be called instead of
5321  *	netdev_update_features() if also dev->vlan_features might
5322  *	have changed to allow the changes to be propagated to stacked
5323  *	VLAN devices.
5324  */
5325 void netdev_change_features(struct net_device *dev)
5326 {
5327 	__netdev_update_features(dev);
5328 	netdev_features_change(dev);
5329 }
5330 EXPORT_SYMBOL(netdev_change_features);
5331 
5332 /**
5333  *	netif_stacked_transfer_operstate -	transfer operstate
5334  *	@rootdev: the root or lower level device to transfer state from
5335  *	@dev: the device to transfer operstate to
5336  *
5337  *	Transfer operational state from root to device. This is normally
5338  *	called when a stacking relationship exists between the root
5339  *	device and the device(a leaf device).
5340  */
5341 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5342 					struct net_device *dev)
5343 {
5344 	if (rootdev->operstate == IF_OPER_DORMANT)
5345 		netif_dormant_on(dev);
5346 	else
5347 		netif_dormant_off(dev);
5348 
5349 	if (netif_carrier_ok(rootdev)) {
5350 		if (!netif_carrier_ok(dev))
5351 			netif_carrier_on(dev);
5352 	} else {
5353 		if (netif_carrier_ok(dev))
5354 			netif_carrier_off(dev);
5355 	}
5356 }
5357 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5358 
5359 #ifdef CONFIG_RPS
5360 static int netif_alloc_rx_queues(struct net_device *dev)
5361 {
5362 	unsigned int i, count = dev->num_rx_queues;
5363 	struct netdev_rx_queue *rx;
5364 
5365 	BUG_ON(count < 1);
5366 
5367 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5368 	if (!rx) {
5369 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5370 		return -ENOMEM;
5371 	}
5372 	dev->_rx = rx;
5373 
5374 	for (i = 0; i < count; i++)
5375 		rx[i].dev = dev;
5376 	return 0;
5377 }
5378 #endif
5379 
5380 static void netdev_init_one_queue(struct net_device *dev,
5381 				  struct netdev_queue *queue, void *_unused)
5382 {
5383 	/* Initialize queue lock */
5384 	spin_lock_init(&queue->_xmit_lock);
5385 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5386 	queue->xmit_lock_owner = -1;
5387 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5388 	queue->dev = dev;
5389 }
5390 
5391 static int netif_alloc_netdev_queues(struct net_device *dev)
5392 {
5393 	unsigned int count = dev->num_tx_queues;
5394 	struct netdev_queue *tx;
5395 
5396 	BUG_ON(count < 1);
5397 
5398 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5399 	if (!tx) {
5400 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5401 		       count);
5402 		return -ENOMEM;
5403 	}
5404 	dev->_tx = tx;
5405 
5406 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5407 	spin_lock_init(&dev->tx_global_lock);
5408 
5409 	return 0;
5410 }
5411 
5412 /**
5413  *	register_netdevice	- register a network device
5414  *	@dev: device to register
5415  *
5416  *	Take a completed network device structure and add it to the kernel
5417  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5418  *	chain. 0 is returned on success. A negative errno code is returned
5419  *	on a failure to set up the device, or if the name is a duplicate.
5420  *
5421  *	Callers must hold the rtnl semaphore. You may want
5422  *	register_netdev() instead of this.
5423  *
5424  *	BUGS:
5425  *	The locking appears insufficient to guarantee two parallel registers
5426  *	will not get the same name.
5427  */
5428 
5429 int register_netdevice(struct net_device *dev)
5430 {
5431 	int ret;
5432 	struct net *net = dev_net(dev);
5433 
5434 	BUG_ON(dev_boot_phase);
5435 	ASSERT_RTNL();
5436 
5437 	might_sleep();
5438 
5439 	/* When net_device's are persistent, this will be fatal. */
5440 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5441 	BUG_ON(!net);
5442 
5443 	spin_lock_init(&dev->addr_list_lock);
5444 	netdev_set_addr_lockdep_class(dev);
5445 
5446 	dev->iflink = -1;
5447 
5448 	ret = dev_get_valid_name(dev, dev->name);
5449 	if (ret < 0)
5450 		goto out;
5451 
5452 	/* Init, if this function is available */
5453 	if (dev->netdev_ops->ndo_init) {
5454 		ret = dev->netdev_ops->ndo_init(dev);
5455 		if (ret) {
5456 			if (ret > 0)
5457 				ret = -EIO;
5458 			goto out;
5459 		}
5460 	}
5461 
5462 	dev->ifindex = dev_new_index(net);
5463 	if (dev->iflink == -1)
5464 		dev->iflink = dev->ifindex;
5465 
5466 	/* Transfer changeable features to wanted_features and enable
5467 	 * software offloads (GSO and GRO).
5468 	 */
5469 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5470 	dev->features |= NETIF_F_SOFT_FEATURES;
5471 	dev->wanted_features = dev->features & dev->hw_features;
5472 
5473 	/* Turn on no cache copy if HW is doing checksum */
5474 	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5475 	if ((dev->features & NETIF_F_ALL_CSUM) &&
5476 	    !(dev->features & NETIF_F_NO_CSUM)) {
5477 		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5478 		dev->features |= NETIF_F_NOCACHE_COPY;
5479 	}
5480 
5481 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5482 	 * vlan_dev_init() will do the dev->features check, so these features
5483 	 * are enabled only if supported by underlying device.
5484 	 */
5485 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5486 
5487 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5488 	ret = notifier_to_errno(ret);
5489 	if (ret)
5490 		goto err_uninit;
5491 
5492 	ret = netdev_register_kobject(dev);
5493 	if (ret)
5494 		goto err_uninit;
5495 	dev->reg_state = NETREG_REGISTERED;
5496 
5497 	__netdev_update_features(dev);
5498 
5499 	/*
5500 	 *	Default initial state at registry is that the
5501 	 *	device is present.
5502 	 */
5503 
5504 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5505 
5506 	dev_init_scheduler(dev);
5507 	dev_hold(dev);
5508 	list_netdevice(dev);
5509 
5510 	/* Notify protocols, that a new device appeared. */
5511 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5512 	ret = notifier_to_errno(ret);
5513 	if (ret) {
5514 		rollback_registered(dev);
5515 		dev->reg_state = NETREG_UNREGISTERED;
5516 	}
5517 	/*
5518 	 *	Prevent userspace races by waiting until the network
5519 	 *	device is fully setup before sending notifications.
5520 	 */
5521 	if (!dev->rtnl_link_ops ||
5522 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5523 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5524 
5525 out:
5526 	return ret;
5527 
5528 err_uninit:
5529 	if (dev->netdev_ops->ndo_uninit)
5530 		dev->netdev_ops->ndo_uninit(dev);
5531 	goto out;
5532 }
5533 EXPORT_SYMBOL(register_netdevice);
5534 
5535 /**
5536  *	init_dummy_netdev	- init a dummy network device for NAPI
5537  *	@dev: device to init
5538  *
5539  *	This takes a network device structure and initialize the minimum
5540  *	amount of fields so it can be used to schedule NAPI polls without
5541  *	registering a full blown interface. This is to be used by drivers
5542  *	that need to tie several hardware interfaces to a single NAPI
5543  *	poll scheduler due to HW limitations.
5544  */
5545 int init_dummy_netdev(struct net_device *dev)
5546 {
5547 	/* Clear everything. Note we don't initialize spinlocks
5548 	 * are they aren't supposed to be taken by any of the
5549 	 * NAPI code and this dummy netdev is supposed to be
5550 	 * only ever used for NAPI polls
5551 	 */
5552 	memset(dev, 0, sizeof(struct net_device));
5553 
5554 	/* make sure we BUG if trying to hit standard
5555 	 * register/unregister code path
5556 	 */
5557 	dev->reg_state = NETREG_DUMMY;
5558 
5559 	/* NAPI wants this */
5560 	INIT_LIST_HEAD(&dev->napi_list);
5561 
5562 	/* a dummy interface is started by default */
5563 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5564 	set_bit(__LINK_STATE_START, &dev->state);
5565 
5566 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5567 	 * because users of this 'device' dont need to change
5568 	 * its refcount.
5569 	 */
5570 
5571 	return 0;
5572 }
5573 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5574 
5575 
5576 /**
5577  *	register_netdev	- register a network device
5578  *	@dev: device to register
5579  *
5580  *	Take a completed network device structure and add it to the kernel
5581  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5582  *	chain. 0 is returned on success. A negative errno code is returned
5583  *	on a failure to set up the device, or if the name is a duplicate.
5584  *
5585  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5586  *	and expands the device name if you passed a format string to
5587  *	alloc_netdev.
5588  */
5589 int register_netdev(struct net_device *dev)
5590 {
5591 	int err;
5592 
5593 	rtnl_lock();
5594 	err = register_netdevice(dev);
5595 	rtnl_unlock();
5596 	return err;
5597 }
5598 EXPORT_SYMBOL(register_netdev);
5599 
5600 int netdev_refcnt_read(const struct net_device *dev)
5601 {
5602 	int i, refcnt = 0;
5603 
5604 	for_each_possible_cpu(i)
5605 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5606 	return refcnt;
5607 }
5608 EXPORT_SYMBOL(netdev_refcnt_read);
5609 
5610 /*
5611  * netdev_wait_allrefs - wait until all references are gone.
5612  *
5613  * This is called when unregistering network devices.
5614  *
5615  * Any protocol or device that holds a reference should register
5616  * for netdevice notification, and cleanup and put back the
5617  * reference if they receive an UNREGISTER event.
5618  * We can get stuck here if buggy protocols don't correctly
5619  * call dev_put.
5620  */
5621 static void netdev_wait_allrefs(struct net_device *dev)
5622 {
5623 	unsigned long rebroadcast_time, warning_time;
5624 	int refcnt;
5625 
5626 	linkwatch_forget_dev(dev);
5627 
5628 	rebroadcast_time = warning_time = jiffies;
5629 	refcnt = netdev_refcnt_read(dev);
5630 
5631 	while (refcnt != 0) {
5632 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5633 			rtnl_lock();
5634 
5635 			/* Rebroadcast unregister notification */
5636 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5637 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5638 			 * should have already handle it the first time */
5639 
5640 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5641 				     &dev->state)) {
5642 				/* We must not have linkwatch events
5643 				 * pending on unregister. If this
5644 				 * happens, we simply run the queue
5645 				 * unscheduled, resulting in a noop
5646 				 * for this device.
5647 				 */
5648 				linkwatch_run_queue();
5649 			}
5650 
5651 			__rtnl_unlock();
5652 
5653 			rebroadcast_time = jiffies;
5654 		}
5655 
5656 		msleep(250);
5657 
5658 		refcnt = netdev_refcnt_read(dev);
5659 
5660 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5661 			printk(KERN_EMERG "unregister_netdevice: "
5662 			       "waiting for %s to become free. Usage "
5663 			       "count = %d\n",
5664 			       dev->name, refcnt);
5665 			warning_time = jiffies;
5666 		}
5667 	}
5668 }
5669 
5670 /* The sequence is:
5671  *
5672  *	rtnl_lock();
5673  *	...
5674  *	register_netdevice(x1);
5675  *	register_netdevice(x2);
5676  *	...
5677  *	unregister_netdevice(y1);
5678  *	unregister_netdevice(y2);
5679  *      ...
5680  *	rtnl_unlock();
5681  *	free_netdev(y1);
5682  *	free_netdev(y2);
5683  *
5684  * We are invoked by rtnl_unlock().
5685  * This allows us to deal with problems:
5686  * 1) We can delete sysfs objects which invoke hotplug
5687  *    without deadlocking with linkwatch via keventd.
5688  * 2) Since we run with the RTNL semaphore not held, we can sleep
5689  *    safely in order to wait for the netdev refcnt to drop to zero.
5690  *
5691  * We must not return until all unregister events added during
5692  * the interval the lock was held have been completed.
5693  */
5694 void netdev_run_todo(void)
5695 {
5696 	struct list_head list;
5697 
5698 	/* Snapshot list, allow later requests */
5699 	list_replace_init(&net_todo_list, &list);
5700 
5701 	__rtnl_unlock();
5702 
5703 	while (!list_empty(&list)) {
5704 		struct net_device *dev
5705 			= list_first_entry(&list, struct net_device, todo_list);
5706 		list_del(&dev->todo_list);
5707 
5708 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5709 			printk(KERN_ERR "network todo '%s' but state %d\n",
5710 			       dev->name, dev->reg_state);
5711 			dump_stack();
5712 			continue;
5713 		}
5714 
5715 		dev->reg_state = NETREG_UNREGISTERED;
5716 
5717 		on_each_cpu(flush_backlog, dev, 1);
5718 
5719 		netdev_wait_allrefs(dev);
5720 
5721 		/* paranoia */
5722 		BUG_ON(netdev_refcnt_read(dev));
5723 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5724 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5725 		WARN_ON(dev->dn_ptr);
5726 
5727 		if (dev->destructor)
5728 			dev->destructor(dev);
5729 
5730 		/* Free network device */
5731 		kobject_put(&dev->dev.kobj);
5732 	}
5733 }
5734 
5735 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5736  * fields in the same order, with only the type differing.
5737  */
5738 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5739 				    const struct net_device_stats *netdev_stats)
5740 {
5741 #if BITS_PER_LONG == 64
5742         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5743         memcpy(stats64, netdev_stats, sizeof(*stats64));
5744 #else
5745 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5746 	const unsigned long *src = (const unsigned long *)netdev_stats;
5747 	u64 *dst = (u64 *)stats64;
5748 
5749 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5750 		     sizeof(*stats64) / sizeof(u64));
5751 	for (i = 0; i < n; i++)
5752 		dst[i] = src[i];
5753 #endif
5754 }
5755 
5756 /**
5757  *	dev_get_stats	- get network device statistics
5758  *	@dev: device to get statistics from
5759  *	@storage: place to store stats
5760  *
5761  *	Get network statistics from device. Return @storage.
5762  *	The device driver may provide its own method by setting
5763  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5764  *	otherwise the internal statistics structure is used.
5765  */
5766 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5767 					struct rtnl_link_stats64 *storage)
5768 {
5769 	const struct net_device_ops *ops = dev->netdev_ops;
5770 
5771 	if (ops->ndo_get_stats64) {
5772 		memset(storage, 0, sizeof(*storage));
5773 		ops->ndo_get_stats64(dev, storage);
5774 	} else if (ops->ndo_get_stats) {
5775 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5776 	} else {
5777 		netdev_stats_to_stats64(storage, &dev->stats);
5778 	}
5779 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5780 	return storage;
5781 }
5782 EXPORT_SYMBOL(dev_get_stats);
5783 
5784 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5785 {
5786 	struct netdev_queue *queue = dev_ingress_queue(dev);
5787 
5788 #ifdef CONFIG_NET_CLS_ACT
5789 	if (queue)
5790 		return queue;
5791 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5792 	if (!queue)
5793 		return NULL;
5794 	netdev_init_one_queue(dev, queue, NULL);
5795 	queue->qdisc = &noop_qdisc;
5796 	queue->qdisc_sleeping = &noop_qdisc;
5797 	rcu_assign_pointer(dev->ingress_queue, queue);
5798 #endif
5799 	return queue;
5800 }
5801 
5802 /**
5803  *	alloc_netdev_mqs - allocate network device
5804  *	@sizeof_priv:	size of private data to allocate space for
5805  *	@name:		device name format string
5806  *	@setup:		callback to initialize device
5807  *	@txqs:		the number of TX subqueues to allocate
5808  *	@rxqs:		the number of RX subqueues to allocate
5809  *
5810  *	Allocates a struct net_device with private data area for driver use
5811  *	and performs basic initialization.  Also allocates subquue structs
5812  *	for each queue on the device.
5813  */
5814 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5815 		void (*setup)(struct net_device *),
5816 		unsigned int txqs, unsigned int rxqs)
5817 {
5818 	struct net_device *dev;
5819 	size_t alloc_size;
5820 	struct net_device *p;
5821 
5822 	BUG_ON(strlen(name) >= sizeof(dev->name));
5823 
5824 	if (txqs < 1) {
5825 		pr_err("alloc_netdev: Unable to allocate device "
5826 		       "with zero queues.\n");
5827 		return NULL;
5828 	}
5829 
5830 #ifdef CONFIG_RPS
5831 	if (rxqs < 1) {
5832 		pr_err("alloc_netdev: Unable to allocate device "
5833 		       "with zero RX queues.\n");
5834 		return NULL;
5835 	}
5836 #endif
5837 
5838 	alloc_size = sizeof(struct net_device);
5839 	if (sizeof_priv) {
5840 		/* ensure 32-byte alignment of private area */
5841 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5842 		alloc_size += sizeof_priv;
5843 	}
5844 	/* ensure 32-byte alignment of whole construct */
5845 	alloc_size += NETDEV_ALIGN - 1;
5846 
5847 	p = kzalloc(alloc_size, GFP_KERNEL);
5848 	if (!p) {
5849 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5850 		return NULL;
5851 	}
5852 
5853 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5854 	dev->padded = (char *)dev - (char *)p;
5855 
5856 	dev->pcpu_refcnt = alloc_percpu(int);
5857 	if (!dev->pcpu_refcnt)
5858 		goto free_p;
5859 
5860 	if (dev_addr_init(dev))
5861 		goto free_pcpu;
5862 
5863 	dev_mc_init(dev);
5864 	dev_uc_init(dev);
5865 
5866 	dev_net_set(dev, &init_net);
5867 
5868 	dev->gso_max_size = GSO_MAX_SIZE;
5869 
5870 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5871 	dev->ethtool_ntuple_list.count = 0;
5872 	INIT_LIST_HEAD(&dev->napi_list);
5873 	INIT_LIST_HEAD(&dev->unreg_list);
5874 	INIT_LIST_HEAD(&dev->link_watch_list);
5875 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5876 	setup(dev);
5877 
5878 	dev->num_tx_queues = txqs;
5879 	dev->real_num_tx_queues = txqs;
5880 	if (netif_alloc_netdev_queues(dev))
5881 		goto free_all;
5882 
5883 #ifdef CONFIG_RPS
5884 	dev->num_rx_queues = rxqs;
5885 	dev->real_num_rx_queues = rxqs;
5886 	if (netif_alloc_rx_queues(dev))
5887 		goto free_all;
5888 #endif
5889 
5890 	strcpy(dev->name, name);
5891 	dev->group = INIT_NETDEV_GROUP;
5892 	return dev;
5893 
5894 free_all:
5895 	free_netdev(dev);
5896 	return NULL;
5897 
5898 free_pcpu:
5899 	free_percpu(dev->pcpu_refcnt);
5900 	kfree(dev->_tx);
5901 #ifdef CONFIG_RPS
5902 	kfree(dev->_rx);
5903 #endif
5904 
5905 free_p:
5906 	kfree(p);
5907 	return NULL;
5908 }
5909 EXPORT_SYMBOL(alloc_netdev_mqs);
5910 
5911 /**
5912  *	free_netdev - free network device
5913  *	@dev: device
5914  *
5915  *	This function does the last stage of destroying an allocated device
5916  * 	interface. The reference to the device object is released.
5917  *	If this is the last reference then it will be freed.
5918  */
5919 void free_netdev(struct net_device *dev)
5920 {
5921 	struct napi_struct *p, *n;
5922 
5923 	release_net(dev_net(dev));
5924 
5925 	kfree(dev->_tx);
5926 #ifdef CONFIG_RPS
5927 	kfree(dev->_rx);
5928 #endif
5929 
5930 	kfree(rcu_dereference_raw(dev->ingress_queue));
5931 
5932 	/* Flush device addresses */
5933 	dev_addr_flush(dev);
5934 
5935 	/* Clear ethtool n-tuple list */
5936 	ethtool_ntuple_flush(dev);
5937 
5938 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5939 		netif_napi_del(p);
5940 
5941 	free_percpu(dev->pcpu_refcnt);
5942 	dev->pcpu_refcnt = NULL;
5943 
5944 	/*  Compatibility with error handling in drivers */
5945 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5946 		kfree((char *)dev - dev->padded);
5947 		return;
5948 	}
5949 
5950 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5951 	dev->reg_state = NETREG_RELEASED;
5952 
5953 	/* will free via device release */
5954 	put_device(&dev->dev);
5955 }
5956 EXPORT_SYMBOL(free_netdev);
5957 
5958 /**
5959  *	synchronize_net -  Synchronize with packet receive processing
5960  *
5961  *	Wait for packets currently being received to be done.
5962  *	Does not block later packets from starting.
5963  */
5964 void synchronize_net(void)
5965 {
5966 	might_sleep();
5967 	if (rtnl_is_locked())
5968 		synchronize_rcu_expedited();
5969 	else
5970 		synchronize_rcu();
5971 }
5972 EXPORT_SYMBOL(synchronize_net);
5973 
5974 /**
5975  *	unregister_netdevice_queue - remove device from the kernel
5976  *	@dev: device
5977  *	@head: list
5978  *
5979  *	This function shuts down a device interface and removes it
5980  *	from the kernel tables.
5981  *	If head not NULL, device is queued to be unregistered later.
5982  *
5983  *	Callers must hold the rtnl semaphore.  You may want
5984  *	unregister_netdev() instead of this.
5985  */
5986 
5987 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5988 {
5989 	ASSERT_RTNL();
5990 
5991 	if (head) {
5992 		list_move_tail(&dev->unreg_list, head);
5993 	} else {
5994 		rollback_registered(dev);
5995 		/* Finish processing unregister after unlock */
5996 		net_set_todo(dev);
5997 	}
5998 }
5999 EXPORT_SYMBOL(unregister_netdevice_queue);
6000 
6001 /**
6002  *	unregister_netdevice_many - unregister many devices
6003  *	@head: list of devices
6004  */
6005 void unregister_netdevice_many(struct list_head *head)
6006 {
6007 	struct net_device *dev;
6008 
6009 	if (!list_empty(head)) {
6010 		rollback_registered_many(head);
6011 		list_for_each_entry(dev, head, unreg_list)
6012 			net_set_todo(dev);
6013 	}
6014 }
6015 EXPORT_SYMBOL(unregister_netdevice_many);
6016 
6017 /**
6018  *	unregister_netdev - remove device from the kernel
6019  *	@dev: device
6020  *
6021  *	This function shuts down a device interface and removes it
6022  *	from the kernel tables.
6023  *
6024  *	This is just a wrapper for unregister_netdevice that takes
6025  *	the rtnl semaphore.  In general you want to use this and not
6026  *	unregister_netdevice.
6027  */
6028 void unregister_netdev(struct net_device *dev)
6029 {
6030 	rtnl_lock();
6031 	unregister_netdevice(dev);
6032 	rtnl_unlock();
6033 }
6034 EXPORT_SYMBOL(unregister_netdev);
6035 
6036 /**
6037  *	dev_change_net_namespace - move device to different nethost namespace
6038  *	@dev: device
6039  *	@net: network namespace
6040  *	@pat: If not NULL name pattern to try if the current device name
6041  *	      is already taken in the destination network namespace.
6042  *
6043  *	This function shuts down a device interface and moves it
6044  *	to a new network namespace. On success 0 is returned, on
6045  *	a failure a netagive errno code is returned.
6046  *
6047  *	Callers must hold the rtnl semaphore.
6048  */
6049 
6050 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6051 {
6052 	int err;
6053 
6054 	ASSERT_RTNL();
6055 
6056 	/* Don't allow namespace local devices to be moved. */
6057 	err = -EINVAL;
6058 	if (dev->features & NETIF_F_NETNS_LOCAL)
6059 		goto out;
6060 
6061 	/* Ensure the device has been registrered */
6062 	err = -EINVAL;
6063 	if (dev->reg_state != NETREG_REGISTERED)
6064 		goto out;
6065 
6066 	/* Get out if there is nothing todo */
6067 	err = 0;
6068 	if (net_eq(dev_net(dev), net))
6069 		goto out;
6070 
6071 	/* Pick the destination device name, and ensure
6072 	 * we can use it in the destination network namespace.
6073 	 */
6074 	err = -EEXIST;
6075 	if (__dev_get_by_name(net, dev->name)) {
6076 		/* We get here if we can't use the current device name */
6077 		if (!pat)
6078 			goto out;
6079 		if (dev_get_valid_name(dev, pat) < 0)
6080 			goto out;
6081 	}
6082 
6083 	/*
6084 	 * And now a mini version of register_netdevice unregister_netdevice.
6085 	 */
6086 
6087 	/* If device is running close it first. */
6088 	dev_close(dev);
6089 
6090 	/* And unlink it from device chain */
6091 	err = -ENODEV;
6092 	unlist_netdevice(dev);
6093 
6094 	synchronize_net();
6095 
6096 	/* Shutdown queueing discipline. */
6097 	dev_shutdown(dev);
6098 
6099 	/* Notify protocols, that we are about to destroy
6100 	   this device. They should clean all the things.
6101 
6102 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6103 	   This is wanted because this way 8021q and macvlan know
6104 	   the device is just moving and can keep their slaves up.
6105 	*/
6106 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6107 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6108 
6109 	/*
6110 	 *	Flush the unicast and multicast chains
6111 	 */
6112 	dev_uc_flush(dev);
6113 	dev_mc_flush(dev);
6114 
6115 	/* Actually switch the network namespace */
6116 	dev_net_set(dev, net);
6117 
6118 	/* If there is an ifindex conflict assign a new one */
6119 	if (__dev_get_by_index(net, dev->ifindex)) {
6120 		int iflink = (dev->iflink == dev->ifindex);
6121 		dev->ifindex = dev_new_index(net);
6122 		if (iflink)
6123 			dev->iflink = dev->ifindex;
6124 	}
6125 
6126 	/* Fixup kobjects */
6127 	err = device_rename(&dev->dev, dev->name);
6128 	WARN_ON(err);
6129 
6130 	/* Add the device back in the hashes */
6131 	list_netdevice(dev);
6132 
6133 	/* Notify protocols, that a new device appeared. */
6134 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6135 
6136 	/*
6137 	 *	Prevent userspace races by waiting until the network
6138 	 *	device is fully setup before sending notifications.
6139 	 */
6140 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6141 
6142 	synchronize_net();
6143 	err = 0;
6144 out:
6145 	return err;
6146 }
6147 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6148 
6149 static int dev_cpu_callback(struct notifier_block *nfb,
6150 			    unsigned long action,
6151 			    void *ocpu)
6152 {
6153 	struct sk_buff **list_skb;
6154 	struct sk_buff *skb;
6155 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6156 	struct softnet_data *sd, *oldsd;
6157 
6158 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6159 		return NOTIFY_OK;
6160 
6161 	local_irq_disable();
6162 	cpu = smp_processor_id();
6163 	sd = &per_cpu(softnet_data, cpu);
6164 	oldsd = &per_cpu(softnet_data, oldcpu);
6165 
6166 	/* Find end of our completion_queue. */
6167 	list_skb = &sd->completion_queue;
6168 	while (*list_skb)
6169 		list_skb = &(*list_skb)->next;
6170 	/* Append completion queue from offline CPU. */
6171 	*list_skb = oldsd->completion_queue;
6172 	oldsd->completion_queue = NULL;
6173 
6174 	/* Append output queue from offline CPU. */
6175 	if (oldsd->output_queue) {
6176 		*sd->output_queue_tailp = oldsd->output_queue;
6177 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6178 		oldsd->output_queue = NULL;
6179 		oldsd->output_queue_tailp = &oldsd->output_queue;
6180 	}
6181 
6182 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6183 	local_irq_enable();
6184 
6185 	/* Process offline CPU's input_pkt_queue */
6186 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6187 		netif_rx(skb);
6188 		input_queue_head_incr(oldsd);
6189 	}
6190 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6191 		netif_rx(skb);
6192 		input_queue_head_incr(oldsd);
6193 	}
6194 
6195 	return NOTIFY_OK;
6196 }
6197 
6198 
6199 /**
6200  *	netdev_increment_features - increment feature set by one
6201  *	@all: current feature set
6202  *	@one: new feature set
6203  *	@mask: mask feature set
6204  *
6205  *	Computes a new feature set after adding a device with feature set
6206  *	@one to the master device with current feature set @all.  Will not
6207  *	enable anything that is off in @mask. Returns the new feature set.
6208  */
6209 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6210 {
6211 	if (mask & NETIF_F_GEN_CSUM)
6212 		mask |= NETIF_F_ALL_CSUM;
6213 	mask |= NETIF_F_VLAN_CHALLENGED;
6214 
6215 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6216 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6217 
6218 	/* If device needs checksumming, downgrade to it. */
6219 	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6220 		all &= ~NETIF_F_NO_CSUM;
6221 
6222 	/* If one device supports hw checksumming, set for all. */
6223 	if (all & NETIF_F_GEN_CSUM)
6224 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6225 
6226 	return all;
6227 }
6228 EXPORT_SYMBOL(netdev_increment_features);
6229 
6230 static struct hlist_head *netdev_create_hash(void)
6231 {
6232 	int i;
6233 	struct hlist_head *hash;
6234 
6235 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6236 	if (hash != NULL)
6237 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6238 			INIT_HLIST_HEAD(&hash[i]);
6239 
6240 	return hash;
6241 }
6242 
6243 /* Initialize per network namespace state */
6244 static int __net_init netdev_init(struct net *net)
6245 {
6246 	INIT_LIST_HEAD(&net->dev_base_head);
6247 
6248 	net->dev_name_head = netdev_create_hash();
6249 	if (net->dev_name_head == NULL)
6250 		goto err_name;
6251 
6252 	net->dev_index_head = netdev_create_hash();
6253 	if (net->dev_index_head == NULL)
6254 		goto err_idx;
6255 
6256 	return 0;
6257 
6258 err_idx:
6259 	kfree(net->dev_name_head);
6260 err_name:
6261 	return -ENOMEM;
6262 }
6263 
6264 /**
6265  *	netdev_drivername - network driver for the device
6266  *	@dev: network device
6267  *	@buffer: buffer for resulting name
6268  *	@len: size of buffer
6269  *
6270  *	Determine network driver for device.
6271  */
6272 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6273 {
6274 	const struct device_driver *driver;
6275 	const struct device *parent;
6276 
6277 	if (len <= 0 || !buffer)
6278 		return buffer;
6279 	buffer[0] = 0;
6280 
6281 	parent = dev->dev.parent;
6282 
6283 	if (!parent)
6284 		return buffer;
6285 
6286 	driver = parent->driver;
6287 	if (driver && driver->name)
6288 		strlcpy(buffer, driver->name, len);
6289 	return buffer;
6290 }
6291 
6292 static int __netdev_printk(const char *level, const struct net_device *dev,
6293 			   struct va_format *vaf)
6294 {
6295 	int r;
6296 
6297 	if (dev && dev->dev.parent)
6298 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6299 			       netdev_name(dev), vaf);
6300 	else if (dev)
6301 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6302 	else
6303 		r = printk("%s(NULL net_device): %pV", level, vaf);
6304 
6305 	return r;
6306 }
6307 
6308 int netdev_printk(const char *level, const struct net_device *dev,
6309 		  const char *format, ...)
6310 {
6311 	struct va_format vaf;
6312 	va_list args;
6313 	int r;
6314 
6315 	va_start(args, format);
6316 
6317 	vaf.fmt = format;
6318 	vaf.va = &args;
6319 
6320 	r = __netdev_printk(level, dev, &vaf);
6321 	va_end(args);
6322 
6323 	return r;
6324 }
6325 EXPORT_SYMBOL(netdev_printk);
6326 
6327 #define define_netdev_printk_level(func, level)			\
6328 int func(const struct net_device *dev, const char *fmt, ...)	\
6329 {								\
6330 	int r;							\
6331 	struct va_format vaf;					\
6332 	va_list args;						\
6333 								\
6334 	va_start(args, fmt);					\
6335 								\
6336 	vaf.fmt = fmt;						\
6337 	vaf.va = &args;						\
6338 								\
6339 	r = __netdev_printk(level, dev, &vaf);			\
6340 	va_end(args);						\
6341 								\
6342 	return r;						\
6343 }								\
6344 EXPORT_SYMBOL(func);
6345 
6346 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6347 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6348 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6349 define_netdev_printk_level(netdev_err, KERN_ERR);
6350 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6351 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6352 define_netdev_printk_level(netdev_info, KERN_INFO);
6353 
6354 static void __net_exit netdev_exit(struct net *net)
6355 {
6356 	kfree(net->dev_name_head);
6357 	kfree(net->dev_index_head);
6358 }
6359 
6360 static struct pernet_operations __net_initdata netdev_net_ops = {
6361 	.init = netdev_init,
6362 	.exit = netdev_exit,
6363 };
6364 
6365 static void __net_exit default_device_exit(struct net *net)
6366 {
6367 	struct net_device *dev, *aux;
6368 	/*
6369 	 * Push all migratable network devices back to the
6370 	 * initial network namespace
6371 	 */
6372 	rtnl_lock();
6373 	for_each_netdev_safe(net, dev, aux) {
6374 		int err;
6375 		char fb_name[IFNAMSIZ];
6376 
6377 		/* Ignore unmoveable devices (i.e. loopback) */
6378 		if (dev->features & NETIF_F_NETNS_LOCAL)
6379 			continue;
6380 
6381 		/* Leave virtual devices for the generic cleanup */
6382 		if (dev->rtnl_link_ops)
6383 			continue;
6384 
6385 		/* Push remaining network devices to init_net */
6386 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6387 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6388 		if (err) {
6389 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6390 				__func__, dev->name, err);
6391 			BUG();
6392 		}
6393 	}
6394 	rtnl_unlock();
6395 }
6396 
6397 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6398 {
6399 	/* At exit all network devices most be removed from a network
6400 	 * namespace.  Do this in the reverse order of registration.
6401 	 * Do this across as many network namespaces as possible to
6402 	 * improve batching efficiency.
6403 	 */
6404 	struct net_device *dev;
6405 	struct net *net;
6406 	LIST_HEAD(dev_kill_list);
6407 
6408 	rtnl_lock();
6409 	list_for_each_entry(net, net_list, exit_list) {
6410 		for_each_netdev_reverse(net, dev) {
6411 			if (dev->rtnl_link_ops)
6412 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6413 			else
6414 				unregister_netdevice_queue(dev, &dev_kill_list);
6415 		}
6416 	}
6417 	unregister_netdevice_many(&dev_kill_list);
6418 	list_del(&dev_kill_list);
6419 	rtnl_unlock();
6420 }
6421 
6422 static struct pernet_operations __net_initdata default_device_ops = {
6423 	.exit = default_device_exit,
6424 	.exit_batch = default_device_exit_batch,
6425 };
6426 
6427 /*
6428  *	Initialize the DEV module. At boot time this walks the device list and
6429  *	unhooks any devices that fail to initialise (normally hardware not
6430  *	present) and leaves us with a valid list of present and active devices.
6431  *
6432  */
6433 
6434 /*
6435  *       This is called single threaded during boot, so no need
6436  *       to take the rtnl semaphore.
6437  */
6438 static int __init net_dev_init(void)
6439 {
6440 	int i, rc = -ENOMEM;
6441 
6442 	BUG_ON(!dev_boot_phase);
6443 
6444 	if (dev_proc_init())
6445 		goto out;
6446 
6447 	if (netdev_kobject_init())
6448 		goto out;
6449 
6450 	INIT_LIST_HEAD(&ptype_all);
6451 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6452 		INIT_LIST_HEAD(&ptype_base[i]);
6453 
6454 	if (register_pernet_subsys(&netdev_net_ops))
6455 		goto out;
6456 
6457 	/*
6458 	 *	Initialise the packet receive queues.
6459 	 */
6460 
6461 	for_each_possible_cpu(i) {
6462 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6463 
6464 		memset(sd, 0, sizeof(*sd));
6465 		skb_queue_head_init(&sd->input_pkt_queue);
6466 		skb_queue_head_init(&sd->process_queue);
6467 		sd->completion_queue = NULL;
6468 		INIT_LIST_HEAD(&sd->poll_list);
6469 		sd->output_queue = NULL;
6470 		sd->output_queue_tailp = &sd->output_queue;
6471 #ifdef CONFIG_RPS
6472 		sd->csd.func = rps_trigger_softirq;
6473 		sd->csd.info = sd;
6474 		sd->csd.flags = 0;
6475 		sd->cpu = i;
6476 #endif
6477 
6478 		sd->backlog.poll = process_backlog;
6479 		sd->backlog.weight = weight_p;
6480 		sd->backlog.gro_list = NULL;
6481 		sd->backlog.gro_count = 0;
6482 	}
6483 
6484 	dev_boot_phase = 0;
6485 
6486 	/* The loopback device is special if any other network devices
6487 	 * is present in a network namespace the loopback device must
6488 	 * be present. Since we now dynamically allocate and free the
6489 	 * loopback device ensure this invariant is maintained by
6490 	 * keeping the loopback device as the first device on the
6491 	 * list of network devices.  Ensuring the loopback devices
6492 	 * is the first device that appears and the last network device
6493 	 * that disappears.
6494 	 */
6495 	if (register_pernet_device(&loopback_net_ops))
6496 		goto out;
6497 
6498 	if (register_pernet_device(&default_device_ops))
6499 		goto out;
6500 
6501 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6502 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6503 
6504 	hotcpu_notifier(dev_cpu_callback, 0);
6505 	dst_init();
6506 	dev_mcast_init();
6507 	rc = 0;
6508 out:
6509 	return rc;
6510 }
6511 
6512 subsys_initcall(net_dev_init);
6513 
6514 static int __init initialize_hashrnd(void)
6515 {
6516 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6517 	return 0;
6518 }
6519 
6520 late_initcall_sync(initialize_hashrnd);
6521 
6522