xref: /linux/net/core/dev.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
 20  *		Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 #include <linux/if_tunnel.h>
137 #include <linux/if_pppox.h>
138 #include <linux/ppp_defs.h>
139 #include <linux/net_tstamp.h>
140 
141 #include "net-sysfs.h"
142 
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145 
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148 
149 /*
150  *	The list of packet types we will receive (as opposed to discard)
151  *	and the routines to invoke.
152  *
 153  *	Why 16? Because with 16 the only overlap we get on a hash of the
154  *	low nibble of the protocol value is RARP/SNAP/X.25.
155  *
156  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
157  *             sure which should go first, but I bet it won't make much
158  *             difference if we are running VLANs.  The good news is that
159  *             this protocol won't be in the list unless compiled in, so
160  *             the average user (w/out VLANs) will not be adversely affected.
161  *             --BLG
162  *
163  *		0800	IP
164  *		8100    802.1Q VLAN
165  *		0001	802.3
166  *		0002	AX.25
167  *		0004	802.2
168  *		8035	RARP
169  *		0005	SNAP
170  *		0805	X.25
171  *		0806	ARP
172  *		8137	IPX
173  *		0009	Localtalk
174  *		86DD	IPv6
175  */
176 
177 #define PTYPE_HASH_SIZE	(16)
178 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
179 
180 static DEFINE_SPINLOCK(ptype_lock);
181 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
182 static struct list_head ptype_all __read_mostly;	/* Taps */
183 
184 /*
185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
186  * semaphore.
187  *
188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
189  *
190  * Writers must hold the rtnl semaphore while they loop through the
191  * dev_base_head list, and hold dev_base_lock for writing when they do the
192  * actual updates.  This allows pure readers to access the list even
193  * while a writer is preparing to update it.
194  *
195  * To put it another way, dev_base_lock is held for writing only to
196  * protect against pure readers; the rtnl semaphore provides the
197  * protection against other writers.
198  *
199  * See, for example usages, register_netdevice() and
200  * unregister_netdevice(), which must be called with the rtnl
201  * semaphore held.
202  */
203 DEFINE_RWLOCK(dev_base_lock);
204 EXPORT_SYMBOL(dev_base_lock);
205 
206 static inline void dev_base_seq_inc(struct net *net)
207 {
208 	while (++net->dev_base_seq == 0);
209 }
210 
211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
212 {
213 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
214 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
215 }
216 
217 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
218 {
219 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
220 }
221 
222 static inline void rps_lock(struct softnet_data *sd)
223 {
224 #ifdef CONFIG_RPS
225 	spin_lock(&sd->input_pkt_queue.lock);
226 #endif
227 }
228 
229 static inline void rps_unlock(struct softnet_data *sd)
230 {
231 #ifdef CONFIG_RPS
232 	spin_unlock(&sd->input_pkt_queue.lock);
233 #endif
234 }
235 
236 /* Device list insertion */
237 static int list_netdevice(struct net_device *dev)
238 {
239 	struct net *net = dev_net(dev);
240 
241 	ASSERT_RTNL();
242 
243 	write_lock_bh(&dev_base_lock);
244 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
245 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
246 	hlist_add_head_rcu(&dev->index_hlist,
247 			   dev_index_hash(net, dev->ifindex));
248 	write_unlock_bh(&dev_base_lock);
249 
250 	dev_base_seq_inc(net);
251 
252 	return 0;
253 }
254 
255 /* Device list removal
 256  * caller must respect an RCU grace period before freeing/reusing dev
257  */
258 static void unlist_netdevice(struct net_device *dev)
259 {
260 	ASSERT_RTNL();
261 
262 	/* Unlink dev from the device chain */
263 	write_lock_bh(&dev_base_lock);
264 	list_del_rcu(&dev->dev_list);
265 	hlist_del_rcu(&dev->name_hlist);
266 	hlist_del_rcu(&dev->index_hlist);
267 	write_unlock_bh(&dev_base_lock);
268 
269 	dev_base_seq_inc(dev_net(dev));
270 }
271 
272 /*
273  *	Our notifier list
274  */
275 
276 static RAW_NOTIFIER_HEAD(netdev_chain);
277 
278 /*
279  *	Device drivers call our routines to queue packets here. We empty the
280  *	queue in the local softnet handler.
281  */
282 
283 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
284 EXPORT_PER_CPU_SYMBOL(softnet_data);
285 
286 #ifdef CONFIG_LOCKDEP
287 /*
288  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
289  * according to dev->type
290  */
291 static const unsigned short netdev_lock_type[] =
292 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
293 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
294 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
295 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
296 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
297 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
298 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
299 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
300 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
301 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
302 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
303 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
304 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
305 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
306 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
307 	 ARPHRD_VOID, ARPHRD_NONE};
308 
309 static const char *const netdev_lock_name[] =
310 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
311 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
312 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
313 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
314 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
315 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
316 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
317 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
318 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
319 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
320 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
321 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
322 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
323 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
324 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
325 	 "_xmit_VOID", "_xmit_NONE"};
326 
327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
329 
330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
331 {
332 	int i;
333 
334 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
335 		if (netdev_lock_type[i] == dev_type)
336 			return i;
337 	/* the last key is used by default */
338 	return ARRAY_SIZE(netdev_lock_type) - 1;
339 }
340 
341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
342 						 unsigned short dev_type)
343 {
344 	int i;
345 
346 	i = netdev_lock_pos(dev_type);
347 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
348 				   netdev_lock_name[i]);
349 }
350 
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 	int i;
354 
355 	i = netdev_lock_pos(dev->type);
356 	lockdep_set_class_and_name(&dev->addr_list_lock,
357 				   &netdev_addr_lock_key[i],
358 				   netdev_lock_name[i]);
359 }
360 #else
361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
362 						 unsigned short dev_type)
363 {
364 }
365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
366 {
367 }
368 #endif
369 
370 /*******************************************************************************
371 
372 		Protocol management and registration routines
373 
374 *******************************************************************************/
375 
376 /*
377  *	Add a protocol ID to the list. Now that the input handler is
378  *	smarter we can dispense with all the messy stuff that used to be
379  *	here.
380  *
 381  *	BEWARE!!! Protocol handlers that mangle input packets
 382  *	MUST BE last in the hash buckets, and checking of protocol handlers
 383  *	MUST start from the promiscuous ptype_all chain in net_bh.
 384  *	This is true now; do not change it.
 385  *	Explanation follows: if a protocol handler that mangles the packet
 386  *	is first on the list, it is not able to sense that the packet
 387  *	is cloned and should be copied-on-write, so it will
 388  *	change it and subsequent readers will get a broken packet.
389  *							--ANK (980803)
390  */
391 
392 static inline struct list_head *ptype_head(const struct packet_type *pt)
393 {
394 	if (pt->type == htons(ETH_P_ALL))
395 		return &ptype_all;
396 	else
397 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
398 }
399 
400 /**
401  *	dev_add_pack - add packet handler
402  *	@pt: packet type declaration
403  *
404  *	Add a protocol handler to the networking stack. The passed &packet_type
405  *	is linked into kernel lists and may not be freed until it has been
406  *	removed from the kernel lists.
407  *
 408  *	This call does not sleep, therefore it cannot
 409  *	guarantee that all CPUs that are in the middle of receiving packets
 410  *	will see the new packet type (until the next received packet).
411  */
412 
413 void dev_add_pack(struct packet_type *pt)
414 {
415 	struct list_head *head = ptype_head(pt);
416 
417 	spin_lock(&ptype_lock);
418 	list_add_rcu(&pt->list, head);
419 	spin_unlock(&ptype_lock);
420 }
421 EXPORT_SYMBOL(dev_add_pack);
422 
423 /**
424  *	__dev_remove_pack	 - remove packet handler
425  *	@pt: packet type declaration
426  *
427  *	Remove a protocol handler that was previously added to the kernel
428  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
429  *	from the kernel lists and can be freed or reused once this function
430  *	returns.
431  *
 432  *	The packet type might still be in use by receivers
 433  *	and must not be freed until after all the CPUs have gone
 434  *	through a quiescent state.
435  */
436 void __dev_remove_pack(struct packet_type *pt)
437 {
438 	struct list_head *head = ptype_head(pt);
439 	struct packet_type *pt1;
440 
441 	spin_lock(&ptype_lock);
442 
443 	list_for_each_entry(pt1, head, list) {
444 		if (pt == pt1) {
445 			list_del_rcu(&pt->list);
446 			goto out;
447 		}
448 	}
449 
450 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
451 out:
452 	spin_unlock(&ptype_lock);
453 }
454 EXPORT_SYMBOL(__dev_remove_pack);
455 
456 /**
457  *	dev_remove_pack	 - remove packet handler
458  *	@pt: packet type declaration
459  *
460  *	Remove a protocol handler that was previously added to the kernel
461  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
462  *	from the kernel lists and can be freed or reused once this function
463  *	returns.
464  *
465  *	This call sleeps to guarantee that no CPU is looking at the packet
466  *	type after return.
467  */
468 void dev_remove_pack(struct packet_type *pt)
469 {
470 	__dev_remove_pack(pt);
471 
472 	synchronize_net();
473 }
474 EXPORT_SYMBOL(dev_remove_pack);
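
/*
 * Illustrative sketch (hypothetical, not part of this file): a minimal
 * protocol handler registered with dev_add_pack() and torn down with
 * dev_remove_pack().  The names my_proto_rcv and my_packet_type are made up;
 * a real handler would do more than drop the frame.
 */
static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
			struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns the skb: consume it (or hand it on) and report success. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),	/* or htons(ETH_P_ALL) to sit on the ptype_all tap list */
	.func = my_proto_rcv,
};

/* Typically: dev_add_pack(&my_packet_type) at module init,
 * dev_remove_pack(&my_packet_type) at module exit.
 */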
475 
476 /******************************************************************************
477 
478 		      Device Boot-time Settings Routines
479 
480 *******************************************************************************/
481 
482 /* Boot time configuration table */
483 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
484 
485 /**
486  *	netdev_boot_setup_add	- add new setup entry
487  *	@name: name of the device
488  *	@map: configured settings for the device
489  *
 490  *	Adds a new setup entry to the dev_boot_setup list.  The function
 491  *	returns 0 on error and 1 on success.  This is a generic routine for
 492  *	all netdevices.
493  */
494 static int netdev_boot_setup_add(char *name, struct ifmap *map)
495 {
496 	struct netdev_boot_setup *s;
497 	int i;
498 
499 	s = dev_boot_setup;
500 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
501 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
502 			memset(s[i].name, 0, sizeof(s[i].name));
503 			strlcpy(s[i].name, name, IFNAMSIZ);
504 			memcpy(&s[i].map, map, sizeof(s[i].map));
505 			break;
506 		}
507 	}
508 
509 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
510 }
511 
512 /**
513  *	netdev_boot_setup_check	- check boot time settings
514  *	@dev: the netdevice
515  *
516  * 	Check boot time settings for the device.
 517  *	Any settings found are applied to the device so they can be used
 518  *	later during device probing.
 519  *	Returns 0 if no settings are found, 1 if they are.
520  */
521 int netdev_boot_setup_check(struct net_device *dev)
522 {
523 	struct netdev_boot_setup *s = dev_boot_setup;
524 	int i;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
527 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
528 		    !strcmp(dev->name, s[i].name)) {
529 			dev->irq 	= s[i].map.irq;
530 			dev->base_addr 	= s[i].map.base_addr;
531 			dev->mem_start 	= s[i].map.mem_start;
532 			dev->mem_end 	= s[i].map.mem_end;
533 			return 1;
534 		}
535 	}
536 	return 0;
537 }
538 EXPORT_SYMBOL(netdev_boot_setup_check);
539 
540 
541 /**
542  *	netdev_boot_base	- get address from boot time settings
543  *	@prefix: prefix for network device
544  *	@unit: id for network device
545  *
 546  * 	Check boot time settings for the base address of the device.
 547  *	Any settings found are applied to the device so they can be used
 548  *	later during device probing.
 549  *	Returns 0 if no settings are found.
550  */
551 unsigned long netdev_boot_base(const char *prefix, int unit)
552 {
553 	const struct netdev_boot_setup *s = dev_boot_setup;
554 	char name[IFNAMSIZ];
555 	int i;
556 
557 	sprintf(name, "%s%d", prefix, unit);
558 
559 	/*
560 	 * If device already registered then return base of 1
561 	 * to indicate not to probe for this interface
562 	 */
563 	if (__dev_get_by_name(&init_net, name))
564 		return 1;
565 
566 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
567 		if (!strcmp(name, s[i].name))
568 			return s[i].map.base_addr;
569 	return 0;
570 }
571 
572 /*
 573  * Saves the settings configured at boot time for any netdevice.
574  */
575 int __init netdev_boot_setup(char *str)
576 {
577 	int ints[5];
578 	struct ifmap map;
579 
580 	str = get_options(str, ARRAY_SIZE(ints), ints);
581 	if (!str || !*str)
582 		return 0;
583 
584 	/* Save settings */
585 	memset(&map, 0, sizeof(map));
586 	if (ints[0] > 0)
587 		map.irq = ints[1];
588 	if (ints[0] > 1)
589 		map.base_addr = ints[2];
590 	if (ints[0] > 2)
591 		map.mem_start = ints[3];
592 	if (ints[0] > 3)
593 		map.mem_end = ints[4];
594 
595 	/* Add new entry to the list */
596 	return netdev_boot_setup_add(str, &map);
597 }
598 
599 __setup("netdev=", netdev_boot_setup);
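
/*
 * Illustrative example (hypothetical values, not from this file): with the
 * parser above, a boot command line entry such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records irq=9, base_addr=0x300, mem_start=0 and mem_end=0 under the name
 * "eth0", to be picked up later by netdev_boot_setup_check() when the eth0
 * driver probes.
 */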
600 
601 /*******************************************************************************
602 
603 			    Device Interface Subroutines
604 
605 *******************************************************************************/
606 
607 /**
608  *	__dev_get_by_name	- find a device by its name
609  *	@net: the applicable net namespace
610  *	@name: name to find
611  *
612  *	Find an interface by name. Must be called under RTNL semaphore
613  *	or @dev_base_lock. If the name is found a pointer to the device
614  *	is returned. If the name is not found then %NULL is returned. The
615  *	reference counters are not incremented so the caller must be
616  *	careful with locks.
617  */
618 
619 struct net_device *__dev_get_by_name(struct net *net, const char *name)
620 {
621 	struct hlist_node *p;
622 	struct net_device *dev;
623 	struct hlist_head *head = dev_name_hash(net, name);
624 
625 	hlist_for_each_entry(dev, p, head, name_hlist)
626 		if (!strncmp(dev->name, name, IFNAMSIZ))
627 			return dev;
628 
629 	return NULL;
630 }
631 EXPORT_SYMBOL(__dev_get_by_name);
632 
633 /**
634  *	dev_get_by_name_rcu	- find a device by its name
635  *	@net: the applicable net namespace
636  *	@name: name to find
637  *
638  *	Find an interface by name.
639  *	If the name is found a pointer to the device is returned.
640  * 	If the name is not found then %NULL is returned.
641  *	The reference counters are not incremented so the caller must be
642  *	careful with locks. The caller must hold RCU lock.
643  */
644 
645 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
646 {
647 	struct hlist_node *p;
648 	struct net_device *dev;
649 	struct hlist_head *head = dev_name_hash(net, name);
650 
651 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
652 		if (!strncmp(dev->name, name, IFNAMSIZ))
653 			return dev;
654 
655 	return NULL;
656 }
657 EXPORT_SYMBOL(dev_get_by_name_rcu);
658 
659 /**
660  *	dev_get_by_name		- find a device by its name
661  *	@net: the applicable net namespace
662  *	@name: name to find
663  *
664  *	Find an interface by name. This can be called from any
665  *	context and does its own locking. The returned handle has
666  *	the usage count incremented and the caller must use dev_put() to
667  *	release it when it is no longer needed. %NULL is returned if no
668  *	matching device is found.
669  */
670 
671 struct net_device *dev_get_by_name(struct net *net, const char *name)
672 {
673 	struct net_device *dev;
674 
675 	rcu_read_lock();
676 	dev = dev_get_by_name_rcu(net, name);
677 	if (dev)
678 		dev_hold(dev);
679 	rcu_read_unlock();
680 	return dev;
681 }
682 EXPORT_SYMBOL(dev_get_by_name);
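
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * refcounted lookup above must be paired with dev_put() once the caller is
 * done with the device.
 */
static int example_use_loopback(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "lo");

	if (!dev)
		return -ENODEV;
	/* ... safe to use dev here, even outside RCU/RTNL ... */
	dev_put(dev);
	return 0;
}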
683 
684 /**
685  *	__dev_get_by_index - find a device by its ifindex
686  *	@net: the applicable net namespace
687  *	@ifindex: index of device
688  *
 689  *	Search for an interface by index. Returns a pointer to the device,
 690  *	or %NULL if it is not found. The device has not
691  *	had its reference counter increased so the caller must be careful
692  *	about locking. The caller must hold either the RTNL semaphore
693  *	or @dev_base_lock.
694  */
695 
696 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
697 {
698 	struct hlist_node *p;
699 	struct net_device *dev;
700 	struct hlist_head *head = dev_index_hash(net, ifindex);
701 
702 	hlist_for_each_entry(dev, p, head, index_hlist)
703 		if (dev->ifindex == ifindex)
704 			return dev;
705 
706 	return NULL;
707 }
708 EXPORT_SYMBOL(__dev_get_by_index);
709 
710 /**
711  *	dev_get_by_index_rcu - find a device by its ifindex
712  *	@net: the applicable net namespace
713  *	@ifindex: index of device
714  *
 715  *	Search for an interface by index. Returns a pointer to the device,
 716  *	or %NULL if it is not found. The device has not
717  *	had its reference counter increased so the caller must be careful
718  *	about locking. The caller must hold RCU lock.
719  */
720 
721 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
722 {
723 	struct hlist_node *p;
724 	struct net_device *dev;
725 	struct hlist_head *head = dev_index_hash(net, ifindex);
726 
727 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
728 		if (dev->ifindex == ifindex)
729 			return dev;
730 
731 	return NULL;
732 }
733 EXPORT_SYMBOL(dev_get_by_index_rcu);
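
/*
 * Illustrative sketch (hypothetical, not part of this file): an RCU lookup
 * takes no reference, so the device pointer is only valid inside the
 * rcu_read_lock() section.
 */
static int example_ifindex_to_mtu(struct net *net, int ifindex)
{
	struct net_device *dev;
	int mtu = -ENODEV;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		mtu = dev->mtu;
	rcu_read_unlock();
	return mtu;
}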
734 
735 
736 /**
737  *	dev_get_by_index - find a device by its ifindex
738  *	@net: the applicable net namespace
739  *	@ifindex: index of device
740  *
 741  *	Search for an interface by index. Returns a pointer to the device,
 742  *	or NULL if it is not found. The device returned has
743  *	had a reference added and the pointer is safe until the user calls
744  *	dev_put to indicate they have finished with it.
745  */
746 
747 struct net_device *dev_get_by_index(struct net *net, int ifindex)
748 {
749 	struct net_device *dev;
750 
751 	rcu_read_lock();
752 	dev = dev_get_by_index_rcu(net, ifindex);
753 	if (dev)
754 		dev_hold(dev);
755 	rcu_read_unlock();
756 	return dev;
757 }
758 EXPORT_SYMBOL(dev_get_by_index);
759 
760 /**
761  *	dev_getbyhwaddr_rcu - find a device by its hardware address
762  *	@net: the applicable net namespace
763  *	@type: media type of device
764  *	@ha: hardware address
765  *
 766  *	Search for an interface by MAC address. Returns a pointer to the
 767  *	device, or NULL if it is not found.
768  *	The caller must hold RCU or RTNL.
769  *	The returned device has not had its ref count increased
770  *	and the caller must therefore be careful about locking
771  *
772  */
773 
774 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
775 				       const char *ha)
776 {
777 	struct net_device *dev;
778 
779 	for_each_netdev_rcu(net, dev)
780 		if (dev->type == type &&
781 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
782 			return dev;
783 
784 	return NULL;
785 }
786 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
787 
788 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
789 {
790 	struct net_device *dev;
791 
792 	ASSERT_RTNL();
793 	for_each_netdev(net, dev)
794 		if (dev->type == type)
795 			return dev;
796 
797 	return NULL;
798 }
799 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
800 
801 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
802 {
803 	struct net_device *dev, *ret = NULL;
804 
805 	rcu_read_lock();
806 	for_each_netdev_rcu(net, dev)
807 		if (dev->type == type) {
808 			dev_hold(dev);
809 			ret = dev;
810 			break;
811 		}
812 	rcu_read_unlock();
813 	return ret;
814 }
815 EXPORT_SYMBOL(dev_getfirstbyhwtype);
816 
817 /**
818  *	dev_get_by_flags_rcu - find any device with given flags
819  *	@net: the applicable net namespace
820  *	@if_flags: IFF_* values
821  *	@mask: bitmask of bits in if_flags to check
822  *
 823  *	Search for any interface with the given flags. Returns a pointer to
 824  *	the device, or NULL if none is found. Must be called inside
825  *	rcu_read_lock(), and result refcount is unchanged.
826  */
827 
828 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
829 				    unsigned short mask)
830 {
831 	struct net_device *dev, *ret;
832 
833 	ret = NULL;
834 	for_each_netdev_rcu(net, dev) {
835 		if (((dev->flags ^ if_flags) & mask) == 0) {
836 			ret = dev;
837 			break;
838 		}
839 	}
840 	return ret;
841 }
842 EXPORT_SYMBOL(dev_get_by_flags_rcu);
843 
844 /**
845  *	dev_valid_name - check if name is okay for network device
846  *	@name: name string
847  *
 848  *	Network device names need to be valid file names
 849  *	to allow sysfs to work.  We also disallow any kind of
850  *	whitespace.
851  */
852 int dev_valid_name(const char *name)
853 {
854 	if (*name == '\0')
855 		return 0;
856 	if (strlen(name) >= IFNAMSIZ)
857 		return 0;
858 	if (!strcmp(name, ".") || !strcmp(name, ".."))
859 		return 0;
860 
861 	while (*name) {
862 		if (*name == '/' || isspace(*name))
863 			return 0;
864 		name++;
865 	}
866 	return 1;
867 }
868 EXPORT_SYMBOL(dev_valid_name);
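
/*
 * Illustrative examples of the rules above (hypothetical calls):
 *	dev_valid_name("eth0")   -> 1
 *	dev_valid_name("eth 0")  -> 0	(whitespace)
 *	dev_valid_name("a/b")    -> 0	(slash)
 *	dev_valid_name("..")     -> 0	(reserved)
 *	dev_valid_name("")       -> 0	(empty)
 */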
869 
870 /**
871  *	__dev_alloc_name - allocate a name for a device
872  *	@net: network namespace to allocate the device name in
873  *	@name: name format string
874  *	@buf:  scratch buffer and result name string
875  *
 876  *	Passed a format string - eg "lt%d" - it will try and find a suitable
 877  *	id. It scans the list of devices to build up a free map, then chooses
878  *	the first empty slot. The caller must hold the dev_base or rtnl lock
879  *	while allocating the name and adding the device in order to avoid
880  *	duplicates.
881  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
882  *	Returns the number of the unit assigned or a negative errno code.
883  */
884 
885 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
886 {
887 	int i = 0;
888 	const char *p;
889 	const int max_netdevices = 8*PAGE_SIZE;
890 	unsigned long *inuse;
891 	struct net_device *d;
892 
893 	p = strnchr(name, IFNAMSIZ-1, '%');
894 	if (p) {
895 		/*
896 		 * Verify the string as this thing may have come from
 897 		 * the user.  There must be exactly one "%d" and no other "%"
898 		 * characters.
899 		 */
900 		if (p[1] != 'd' || strchr(p + 2, '%'))
901 			return -EINVAL;
902 
903 		/* Use one page as a bit array of possible slots */
904 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
905 		if (!inuse)
906 			return -ENOMEM;
907 
908 		for_each_netdev(net, d) {
909 			if (!sscanf(d->name, name, &i))
910 				continue;
911 			if (i < 0 || i >= max_netdevices)
912 				continue;
913 
914 			/*  avoid cases where sscanf is not exact inverse of printf */
915 			snprintf(buf, IFNAMSIZ, name, i);
916 			if (!strncmp(buf, d->name, IFNAMSIZ))
917 				set_bit(i, inuse);
918 		}
919 
920 		i = find_first_zero_bit(inuse, max_netdevices);
921 		free_page((unsigned long) inuse);
922 	}
923 
924 	if (buf != name)
925 		snprintf(buf, IFNAMSIZ, name, i);
926 	if (!__dev_get_by_name(net, buf))
927 		return i;
928 
929 	/* It is possible to run out of possible slots
930 	 * when the name is long and there isn't enough space left
931 	 * for the digits, or if all bits are used.
932 	 */
933 	return -ENFILE;
934 }
935 
936 /**
937  *	dev_alloc_name - allocate a name for a device
938  *	@dev: device
939  *	@name: name format string
940  *
 941  *	Passed a format string - eg "lt%d" - it will try and find a suitable
 942  *	id. It scans the list of devices to build up a free map, then chooses
943  *	the first empty slot. The caller must hold the dev_base or rtnl lock
944  *	while allocating the name and adding the device in order to avoid
945  *	duplicates.
946  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
947  *	Returns the number of the unit assigned or a negative errno code.
948  */
949 
950 int dev_alloc_name(struct net_device *dev, const char *name)
951 {
952 	char buf[IFNAMSIZ];
953 	struct net *net;
954 	int ret;
955 
956 	BUG_ON(!dev_net(dev));
957 	net = dev_net(dev);
958 	ret = __dev_alloc_name(net, name, buf);
959 	if (ret >= 0)
960 		strlcpy(dev->name, buf, IFNAMSIZ);
961 	return ret;
962 }
963 EXPORT_SYMBOL(dev_alloc_name);
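
/*
 * Illustrative example (hypothetical state): if eth0 and eth2 are already
 * registered in the namespace, dev_alloc_name(dev, "eth%d") picks the lowest
 * free slot, sets dev->name to "eth1" and returns 1.
 */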
964 
965 static int dev_get_valid_name(struct net_device *dev, const char *name)
966 {
967 	struct net *net;
968 
969 	BUG_ON(!dev_net(dev));
970 	net = dev_net(dev);
971 
972 	if (!dev_valid_name(name))
973 		return -EINVAL;
974 
975 	if (strchr(name, '%'))
976 		return dev_alloc_name(dev, name);
977 	else if (__dev_get_by_name(net, name))
978 		return -EEXIST;
979 	else if (dev->name != name)
980 		strlcpy(dev->name, name, IFNAMSIZ);
981 
982 	return 0;
983 }
984 
985 /**
986  *	dev_change_name - change name of a device
987  *	@dev: device
988  *	@newname: name (or format string) must be at least IFNAMSIZ
989  *
 990  *	Change the name of a device; a format string such as "eth%d"
 991  *	can be passed for wildcarding.
992  */
993 int dev_change_name(struct net_device *dev, const char *newname)
994 {
995 	char oldname[IFNAMSIZ];
996 	int err = 0;
997 	int ret;
998 	struct net *net;
999 
1000 	ASSERT_RTNL();
1001 	BUG_ON(!dev_net(dev));
1002 
1003 	net = dev_net(dev);
1004 	if (dev->flags & IFF_UP)
1005 		return -EBUSY;
1006 
1007 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1008 		return 0;
1009 
1010 	memcpy(oldname, dev->name, IFNAMSIZ);
1011 
1012 	err = dev_get_valid_name(dev, newname);
1013 	if (err < 0)
1014 		return err;
1015 
1016 rollback:
1017 	ret = device_rename(&dev->dev, dev->name);
1018 	if (ret) {
1019 		memcpy(dev->name, oldname, IFNAMSIZ);
1020 		return ret;
1021 	}
1022 
1023 	write_lock_bh(&dev_base_lock);
1024 	hlist_del_rcu(&dev->name_hlist);
1025 	write_unlock_bh(&dev_base_lock);
1026 
1027 	synchronize_rcu();
1028 
1029 	write_lock_bh(&dev_base_lock);
1030 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1031 	write_unlock_bh(&dev_base_lock);
1032 
1033 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1034 	ret = notifier_to_errno(ret);
1035 
1036 	if (ret) {
1037 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1038 		if (err >= 0) {
1039 			err = ret;
1040 			memcpy(dev->name, oldname, IFNAMSIZ);
1041 			goto rollback;
1042 		} else {
1043 			printk(KERN_ERR
1044 			       "%s: name change rollback failed: %d.\n",
1045 			       dev->name, ret);
1046 		}
1047 	}
1048 
1049 	return err;
1050 }
1051 
1052 /**
1053  *	dev_set_alias - change ifalias of a device
1054  *	@dev: device
1055  *	@alias: name up to IFALIASZ
 1056  *	@len: limit of bytes to copy from @alias
 1057  *
 1058  *	Set the ifalias for a device.
1059  */
1060 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1061 {
1062 	ASSERT_RTNL();
1063 
1064 	if (len >= IFALIASZ)
1065 		return -EINVAL;
1066 
1067 	if (!len) {
1068 		if (dev->ifalias) {
1069 			kfree(dev->ifalias);
1070 			dev->ifalias = NULL;
1071 		}
1072 		return 0;
1073 	}
1074 
1075 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1076 	if (!dev->ifalias)
1077 		return -ENOMEM;
1078 
1079 	strlcpy(dev->ifalias, alias, len+1);
1080 	return len;
1081 }
1082 
1083 
1084 /**
1085  *	netdev_features_change - device changes features
1086  *	@dev: device to cause notification
1087  *
1088  *	Called to indicate a device has changed features.
1089  */
1090 void netdev_features_change(struct net_device *dev)
1091 {
1092 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1093 }
1094 EXPORT_SYMBOL(netdev_features_change);
1095 
1096 /**
1097  *	netdev_state_change - device changes state
1098  *	@dev: device to cause notification
1099  *
1100  *	Called to indicate a device has changed state. This function calls
1101  *	the notifier chains for netdev_chain and sends a NEWLINK message
1102  *	to the routing socket.
1103  */
1104 void netdev_state_change(struct net_device *dev)
1105 {
1106 	if (dev->flags & IFF_UP) {
1107 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1108 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1109 	}
1110 }
1111 EXPORT_SYMBOL(netdev_state_change);
1112 
1113 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1114 {
1115 	return call_netdevice_notifiers(event, dev);
1116 }
1117 EXPORT_SYMBOL(netdev_bonding_change);
1118 
1119 /**
1120  *	dev_load 	- load a network module
1121  *	@net: the applicable net namespace
1122  *	@name: name of interface
1123  *
1124  *	If a network interface is not present and the process has suitable
1125  *	privileges this function loads the module. If module loading is not
1126  *	available in this kernel then it becomes a nop.
1127  */
1128 
1129 void dev_load(struct net *net, const char *name)
1130 {
1131 	struct net_device *dev;
1132 	int no_module;
1133 
1134 	rcu_read_lock();
1135 	dev = dev_get_by_name_rcu(net, name);
1136 	rcu_read_unlock();
1137 
1138 	no_module = !dev;
1139 	if (no_module && capable(CAP_NET_ADMIN))
1140 		no_module = request_module("netdev-%s", name);
1141 	if (no_module && capable(CAP_SYS_MODULE)) {
1142 		if (!request_module("%s", name))
1143 			pr_err("Loading kernel module for a network device "
1144 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1145 "instead\n", name);
1146 	}
1147 }
1148 EXPORT_SYMBOL(dev_load);
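
/*
 * Illustrative example (hypothetical call): dev_load(net, "tun") with no
 * existing "tun" device first tries request_module("netdev-tun") for callers
 * with CAP_NET_ADMIN, and only falls back to request_module("tun") for
 * callers holding the deprecated CAP_SYS_MODULE privilege.
 */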
1149 
1150 static int __dev_open(struct net_device *dev)
1151 {
1152 	const struct net_device_ops *ops = dev->netdev_ops;
1153 	int ret;
1154 
1155 	ASSERT_RTNL();
1156 
1157 	if (!netif_device_present(dev))
1158 		return -ENODEV;
1159 
1160 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1161 	ret = notifier_to_errno(ret);
1162 	if (ret)
1163 		return ret;
1164 
1165 	set_bit(__LINK_STATE_START, &dev->state);
1166 
1167 	if (ops->ndo_validate_addr)
1168 		ret = ops->ndo_validate_addr(dev);
1169 
1170 	if (!ret && ops->ndo_open)
1171 		ret = ops->ndo_open(dev);
1172 
1173 	if (ret)
1174 		clear_bit(__LINK_STATE_START, &dev->state);
1175 	else {
1176 		dev->flags |= IFF_UP;
1177 		net_dmaengine_get();
1178 		dev_set_rx_mode(dev);
1179 		dev_activate(dev);
1180 	}
1181 
1182 	return ret;
1183 }
1184 
1185 /**
1186  *	dev_open	- prepare an interface for use.
1187  *	@dev:	device to open
1188  *
1189  *	Takes a device from down to up state. The device's private open
1190  *	function is invoked and then the multicast lists are loaded. Finally
1191  *	the device is moved into the up state and a %NETDEV_UP message is
1192  *	sent to the netdev notifier chain.
1193  *
1194  *	Calling this function on an active interface is a nop. On a failure
1195  *	a negative errno code is returned.
1196  */
1197 int dev_open(struct net_device *dev)
1198 {
1199 	int ret;
1200 
1201 	if (dev->flags & IFF_UP)
1202 		return 0;
1203 
1204 	ret = __dev_open(dev);
1205 	if (ret < 0)
1206 		return ret;
1207 
1208 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1209 	call_netdevice_notifiers(NETDEV_UP, dev);
1210 
1211 	return ret;
1212 }
1213 EXPORT_SYMBOL(dev_open);
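
/*
 * Illustrative sketch (hypothetical caller, not part of this file): dev_open()
 * must run under the RTNL lock, which its notifier path asserts.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	rtnl_unlock();
	return err;
}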
1214 
1215 static int __dev_close_many(struct list_head *head)
1216 {
1217 	struct net_device *dev;
1218 
1219 	ASSERT_RTNL();
1220 	might_sleep();
1221 
1222 	list_for_each_entry(dev, head, unreg_list) {
1223 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1224 
1225 		clear_bit(__LINK_STATE_START, &dev->state);
1226 
 1227 		/* Synchronize to scheduled poll. We cannot touch the poll list; it
 1228 		 * can even be on a different cpu. So just clear netif_running().
 1229 		 *
 1230 		 * dev->stop() will invoke napi_disable() on all of its
 1231 		 * napi_struct instances on this device.
1232 		 */
1233 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1234 	}
1235 
1236 	dev_deactivate_many(head);
1237 
1238 	list_for_each_entry(dev, head, unreg_list) {
1239 		const struct net_device_ops *ops = dev->netdev_ops;
1240 
1241 		/*
 1242 		 *	Call the device-specific close. This cannot fail and is
 1243 		 *	only done if the device is UP.
1244 		 *
1245 		 *	We allow it to be called even after a DETACH hot-plug
1246 		 *	event.
1247 		 */
1248 		if (ops->ndo_stop)
1249 			ops->ndo_stop(dev);
1250 
1251 		dev->flags &= ~IFF_UP;
1252 		net_dmaengine_put();
1253 	}
1254 
1255 	return 0;
1256 }
1257 
1258 static int __dev_close(struct net_device *dev)
1259 {
1260 	int retval;
1261 	LIST_HEAD(single);
1262 
1263 	list_add(&dev->unreg_list, &single);
1264 	retval = __dev_close_many(&single);
1265 	list_del(&single);
1266 	return retval;
1267 }
1268 
1269 static int dev_close_many(struct list_head *head)
1270 {
1271 	struct net_device *dev, *tmp;
1272 	LIST_HEAD(tmp_list);
1273 
1274 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1275 		if (!(dev->flags & IFF_UP))
1276 			list_move(&dev->unreg_list, &tmp_list);
1277 
1278 	__dev_close_many(head);
1279 
1280 	list_for_each_entry(dev, head, unreg_list) {
1281 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1282 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1283 	}
1284 
1285 	/* rollback_registered_many needs the complete original list */
1286 	list_splice(&tmp_list, head);
1287 	return 0;
1288 }
1289 
1290 /**
1291  *	dev_close - shutdown an interface.
1292  *	@dev: device to shutdown
1293  *
1294  *	This function moves an active device into down state. A
1295  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1296  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1297  *	chain.
1298  */
1299 int dev_close(struct net_device *dev)
1300 {
1301 	if (dev->flags & IFF_UP) {
1302 		LIST_HEAD(single);
1303 
1304 		list_add(&dev->unreg_list, &single);
1305 		dev_close_many(&single);
1306 		list_del(&single);
1307 	}
1308 	return 0;
1309 }
1310 EXPORT_SYMBOL(dev_close);
1311 
1312 
1313 /**
1314  *	dev_disable_lro - disable Large Receive Offload on a device
1315  *	@dev: device
1316  *
1317  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1318  *	called under RTNL.  This is needed if received packets may be
1319  *	forwarded to another interface.
1320  */
1321 void dev_disable_lro(struct net_device *dev)
1322 {
1323 	u32 flags;
1324 
1325 	/*
1326 	 * If we're trying to disable lro on a vlan device
1327 	 * use the underlying physical device instead
1328 	 */
1329 	if (is_vlan_dev(dev))
1330 		dev = vlan_dev_real_dev(dev);
1331 
1332 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1333 		flags = dev->ethtool_ops->get_flags(dev);
1334 	else
1335 		flags = ethtool_op_get_flags(dev);
1336 
1337 	if (!(flags & ETH_FLAG_LRO))
1338 		return;
1339 
1340 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341 	if (unlikely(dev->features & NETIF_F_LRO))
1342 		netdev_WARN(dev, "failed to disable LRO!\n");
1343 }
1344 EXPORT_SYMBOL(dev_disable_lro);
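
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * forwarding-setup path disabling LRO on every device in a namespace, under
 * the RTNL lock that dev_disable_lro() requires.
 */
static void example_disable_lro_all(struct net *net)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		dev_disable_lro(dev);
}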
1345 
1346 
1347 static int dev_boot_phase = 1;
1348 
1349 /**
1350  *	register_netdevice_notifier - register a network notifier block
1351  *	@nb: notifier
1352  *
1353  *	Register a notifier to be called when network device events occur.
1354  *	The notifier passed is linked into the kernel structures and must
1355  *	not be reused until it has been unregistered. A negative errno code
1356  *	is returned on a failure.
1357  *
 1358  * 	When registered, all registration and up events are replayed
 1359  *	to the new notifier to allow the caller to have a race-free
 1360  *	view of the network device list.
1361  */
1362 
1363 int register_netdevice_notifier(struct notifier_block *nb)
1364 {
1365 	struct net_device *dev;
1366 	struct net_device *last;
1367 	struct net *net;
1368 	int err;
1369 
1370 	rtnl_lock();
1371 	err = raw_notifier_chain_register(&netdev_chain, nb);
1372 	if (err)
1373 		goto unlock;
1374 	if (dev_boot_phase)
1375 		goto unlock;
1376 	for_each_net(net) {
1377 		for_each_netdev(net, dev) {
1378 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1379 			err = notifier_to_errno(err);
1380 			if (err)
1381 				goto rollback;
1382 
1383 			if (!(dev->flags & IFF_UP))
1384 				continue;
1385 
1386 			nb->notifier_call(nb, NETDEV_UP, dev);
1387 		}
1388 	}
1389 
1390 unlock:
1391 	rtnl_unlock();
1392 	return err;
1393 
1394 rollback:
1395 	last = dev;
1396 	for_each_net(net) {
1397 		for_each_netdev(net, dev) {
1398 			if (dev == last)
1399 				break;
1400 
1401 			if (dev->flags & IFF_UP) {
1402 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1403 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1404 			}
1405 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1406 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1407 		}
1408 	}
1409 
1410 	raw_notifier_chain_unregister(&netdev_chain, nb);
1411 	goto unlock;
1412 }
1413 EXPORT_SYMBOL(register_netdevice_notifier);
1414 
1415 /**
1416  *	unregister_netdevice_notifier - unregister a network notifier block
1417  *	@nb: notifier
1418  *
1419  *	Unregister a notifier previously registered by
 1420  *	register_netdevice_notifier(). The notifier is unlinked from the
1421  *	kernel structures and may then be reused. A negative errno code
1422  *	is returned on a failure.
1423  */
1424 
1425 int unregister_netdevice_notifier(struct notifier_block *nb)
1426 {
1427 	int err;
1428 
1429 	rtnl_lock();
1430 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1431 	rtnl_unlock();
1432 	return err;
1433 }
1434 EXPORT_SYMBOL(unregister_netdevice_notifier);
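
/*
 * Illustrative sketch (hypothetical names, not part of this file): a minimal
 * notifier block for the chain above.  In this kernel the notifier data
 * pointer is the struct net_device itself.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP || event == NETDEV_DOWN)
		pr_info("%s is %s\n", dev->name,
			event == NETDEV_UP ? "up" : "down");
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb) at init (existing devices
 * are replayed as NETDEV_REGISTER/NETDEV_UP), and
 * unregister_netdevice_notifier(&example_netdev_nb) at exit.
 */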
1435 
1436 /**
1437  *	call_netdevice_notifiers - call all network notifier blocks
 1438  *	@val: value passed unmodified to notifier function
 1439  *	@dev: net_device pointer passed unmodified to notifier function
1440  *
1441  *	Call all network notifier blocks.  Parameters and return value
1442  *	are as for raw_notifier_call_chain().
1443  */
1444 
1445 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1446 {
1447 	ASSERT_RTNL();
1448 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1449 }
1450 EXPORT_SYMBOL(call_netdevice_notifiers);
1451 
1452 /* When > 0 there are consumers of rx skb time stamps */
1453 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1454 
1455 void net_enable_timestamp(void)
1456 {
1457 	atomic_inc(&netstamp_needed);
1458 }
1459 EXPORT_SYMBOL(net_enable_timestamp);
1460 
1461 void net_disable_timestamp(void)
1462 {
1463 	atomic_dec(&netstamp_needed);
1464 }
1465 EXPORT_SYMBOL(net_disable_timestamp);
1466 
1467 static inline void net_timestamp_set(struct sk_buff *skb)
1468 {
1469 	if (atomic_read(&netstamp_needed))
1470 		__net_timestamp(skb);
1471 	else
1472 		skb->tstamp.tv64 = 0;
1473 }
1474 
1475 static inline void net_timestamp_check(struct sk_buff *skb)
1476 {
1477 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1478 		__net_timestamp(skb);
1479 }
1480 
1481 static int net_hwtstamp_validate(struct ifreq *ifr)
1482 {
1483 	struct hwtstamp_config cfg;
1484 	enum hwtstamp_tx_types tx_type;
1485 	enum hwtstamp_rx_filters rx_filter;
1486 	int tx_type_valid = 0;
1487 	int rx_filter_valid = 0;
1488 
1489 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1490 		return -EFAULT;
1491 
1492 	if (cfg.flags) /* reserved for future extensions */
1493 		return -EINVAL;
1494 
1495 	tx_type = cfg.tx_type;
1496 	rx_filter = cfg.rx_filter;
1497 
1498 	switch (tx_type) {
1499 	case HWTSTAMP_TX_OFF:
1500 	case HWTSTAMP_TX_ON:
1501 	case HWTSTAMP_TX_ONESTEP_SYNC:
1502 		tx_type_valid = 1;
1503 		break;
1504 	}
1505 
1506 	switch (rx_filter) {
1507 	case HWTSTAMP_FILTER_NONE:
1508 	case HWTSTAMP_FILTER_ALL:
1509 	case HWTSTAMP_FILTER_SOME:
1510 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1511 	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1512 	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1513 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1514 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1515 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1516 	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1517 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1518 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1519 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1520 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1521 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1522 		rx_filter_valid = 1;
1523 		break;
1524 	}
1525 
1526 	if (!tx_type_valid || !rx_filter_valid)
1527 		return -ERANGE;
1528 
1529 	return 0;
1530 }
1531 
1532 static inline bool is_skb_forwardable(struct net_device *dev,
1533 				      struct sk_buff *skb)
1534 {
1535 	unsigned int len;
1536 
1537 	if (!(dev->flags & IFF_UP))
1538 		return false;
1539 
1540 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1541 	if (skb->len <= len)
1542 		return true;
1543 
 1544 	/* if TSO is enabled, we don't care about the length, as the packet
 1545 	 * could be forwarded without being segmented beforehand
1546 	 */
1547 	if (skb_is_gso(skb))
1548 		return true;
1549 
1550 	return false;
1551 }
1552 
1553 /**
1554  * dev_forward_skb - loopback an skb to another netif
1555  *
1556  * @dev: destination network device
1557  * @skb: buffer to forward
1558  *
1559  * return values:
1560  *	NET_RX_SUCCESS	(no congestion)
1561  *	NET_RX_DROP     (packet was dropped, but freed)
1562  *
1563  * dev_forward_skb can be used for injecting an skb from the
1564  * start_xmit function of one device into the receive queue
1565  * of another device.
1566  *
1567  * The receiving device may be in another namespace, so
1568  * we have to clear all information in the skb that could
1569  * impact namespace isolation.
1570  */
1571 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1572 {
1573 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1574 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1575 			atomic_long_inc(&dev->rx_dropped);
1576 			kfree_skb(skb);
1577 			return NET_RX_DROP;
1578 		}
1579 	}
1580 
1581 	skb_orphan(skb);
1582 	nf_reset(skb);
1583 
1584 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1585 		atomic_long_inc(&dev->rx_dropped);
1586 		kfree_skb(skb);
1587 		return NET_RX_DROP;
1588 	}
1589 	skb_set_dev(skb, dev);
1590 	skb->tstamp.tv64 = 0;
1591 	skb->pkt_type = PACKET_HOST;
1592 	skb->protocol = eth_type_trans(skb, dev);
1593 	return netif_rx(skb);
1594 }
1595 EXPORT_SYMBOL_GPL(dev_forward_skb);
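
/*
 * Illustrative sketch (hypothetical driver, not part of this file): a
 * veth-style ndo_start_xmit that loops frames into a peer device via
 * dev_forward_skb().  example_get_peer() is a made-up helper; the length is
 * recorded before the skb is handed off because it may already be freed on
 * return.
 */
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);
	unsigned int len = skb->len;

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}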
1596 
1597 static inline int deliver_skb(struct sk_buff *skb,
1598 			      struct packet_type *pt_prev,
1599 			      struct net_device *orig_dev)
1600 {
1601 	atomic_inc(&skb->users);
1602 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1603 }
1604 
1605 /*
1606  *	Support routine. Sends outgoing frames to any network
1607  *	taps currently in use.
1608  */
1609 
1610 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1611 {
1612 	struct packet_type *ptype;
1613 	struct sk_buff *skb2 = NULL;
1614 	struct packet_type *pt_prev = NULL;
1615 
1616 	rcu_read_lock();
1617 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1618 		/* Never send packets back to the socket
1619 		 * they originated from - MvS (miquels@drinkel.ow.org)
1620 		 */
1621 		if ((ptype->dev == dev || !ptype->dev) &&
1622 		    (ptype->af_packet_priv == NULL ||
1623 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1624 			if (pt_prev) {
1625 				deliver_skb(skb2, pt_prev, skb->dev);
1626 				pt_prev = ptype;
1627 				continue;
1628 			}
1629 
1630 			skb2 = skb_clone(skb, GFP_ATOMIC);
1631 			if (!skb2)
1632 				break;
1633 
1634 			net_timestamp_set(skb2);
1635 
1636 			/* skb->nh should be correctly
1637 			   set by sender, so that the second statement is
1638 			   just protection against buggy protocols.
1639 			 */
1640 			skb_reset_mac_header(skb2);
1641 
1642 			if (skb_network_header(skb2) < skb2->data ||
1643 			    skb2->network_header > skb2->tail) {
1644 				if (net_ratelimit())
1645 					printk(KERN_CRIT "protocol %04x is "
1646 					       "buggy, dev %s\n",
1647 					       ntohs(skb2->protocol),
1648 					       dev->name);
1649 				skb_reset_network_header(skb2);
1650 			}
1651 
1652 			skb2->transport_header = skb2->network_header;
1653 			skb2->pkt_type = PACKET_OUTGOING;
1654 			pt_prev = ptype;
1655 		}
1656 	}
1657 	if (pt_prev)
1658 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1659 	rcu_read_unlock();
1660 }
1661 
1662 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1663  * @dev: Network device
1664  * @txq: number of queues available
1665  *
 1666  * If real_num_tx_queues is changed the tc mappings may no longer be
 1667  * valid. To resolve this, verify that each tc mapping remains valid and,
 1668  * if not, reset the mapping to TC0. With no priorities mapping to an
 1669  * offset/count pair, it will no longer be used. In the worst case, if TC0
 1670  * is invalid, nothing can be done, so priority mappings are disabled. It is
 1671  * expected that drivers will fix this mapping if they can before
 1672  * calling netif_set_real_num_tx_queues.
1673  */
1674 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1675 {
1676 	int i;
1677 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1678 
1679 	/* If TC0 is invalidated disable TC mapping */
1680 	if (tc->offset + tc->count > txq) {
1681 		pr_warning("Number of in use tx queues changed "
1682 			   "invalidating tc mappings. Priority "
1683 			   "traffic classification disabled!\n");
1684 		dev->num_tc = 0;
1685 		return;
1686 	}
1687 
 1688 	/* Invalidated prio-to-tc mappings are reset to TC0 */
1689 	for (i = 1; i < TC_BITMASK + 1; i++) {
1690 		int q = netdev_get_prio_tc_map(dev, i);
1691 
1692 		tc = &dev->tc_to_txq[q];
1693 		if (tc->offset + tc->count > txq) {
1694 			pr_warning("Number of in use tx queues "
1695 				   "changed. Priority %i to tc "
1696 				   "mapping %i is no longer valid "
1697 				   "setting map to 0\n",
1698 				   i, q);
1699 			netdev_set_prio_tc_map(dev, i, 0);
1700 		}
1701 	}
1702 }
1703 
1704 /*
 1705  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 1706  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1707  */
1708 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1709 {
1710 	int rc;
1711 
1712 	if (txq < 1 || txq > dev->num_tx_queues)
1713 		return -EINVAL;
1714 
1715 	if (dev->reg_state == NETREG_REGISTERED ||
1716 	    dev->reg_state == NETREG_UNREGISTERING) {
1717 		ASSERT_RTNL();
1718 
1719 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1720 						  txq);
1721 		if (rc)
1722 			return rc;
1723 
1724 		if (dev->num_tc)
1725 			netif_setup_tc(dev, txq);
1726 
1727 		if (txq < dev->real_num_tx_queues)
1728 			qdisc_reset_all_tx_gt(dev, txq);
1729 	}
1730 
1731 	dev->real_num_tx_queues = txq;
1732 	return 0;
1733 }
1734 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1735 
1736 #ifdef CONFIG_RPS
1737 /**
1738  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1739  *	@dev: Network device
1740  *	@rxq: Actual number of RX queues
1741  *
1742  *	This must be called either with the rtnl_lock held or before
1743  *	registration of the net device.  Returns 0 on success, or a
1744  *	negative error code.  If called before registration, it always
1745  *	succeeds.
1746  */
1747 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1748 {
1749 	int rc;
1750 
1751 	if (rxq < 1 || rxq > dev->num_rx_queues)
1752 		return -EINVAL;
1753 
1754 	if (dev->reg_state == NETREG_REGISTERED) {
1755 		ASSERT_RTNL();
1756 
1757 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1758 						  rxq);
1759 		if (rc)
1760 			return rc;
1761 	}
1762 
1763 	dev->real_num_rx_queues = rxq;
1764 	return 0;
1765 }
1766 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1767 #endif
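
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * shrinking both queue counts, e.g. after discovering how many interrupt
 * vectors were actually granted.  On a registered device the caller must
 * hold the RTNL lock.
 */
static int example_set_queue_count(struct net_device *dev, unsigned int n)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, n);	/* stubbed out without CONFIG_RPS */
}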
1768 
1769 static inline void __netif_reschedule(struct Qdisc *q)
1770 {
1771 	struct softnet_data *sd;
1772 	unsigned long flags;
1773 
1774 	local_irq_save(flags);
1775 	sd = &__get_cpu_var(softnet_data);
1776 	q->next_sched = NULL;
1777 	*sd->output_queue_tailp = q;
1778 	sd->output_queue_tailp = &q->next_sched;
1779 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1780 	local_irq_restore(flags);
1781 }
1782 
1783 void __netif_schedule(struct Qdisc *q)
1784 {
1785 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1786 		__netif_reschedule(q);
1787 }
1788 EXPORT_SYMBOL(__netif_schedule);
1789 
1790 void dev_kfree_skb_irq(struct sk_buff *skb)
1791 {
1792 	if (atomic_dec_and_test(&skb->users)) {
1793 		struct softnet_data *sd;
1794 		unsigned long flags;
1795 
1796 		local_irq_save(flags);
1797 		sd = &__get_cpu_var(softnet_data);
1798 		skb->next = sd->completion_queue;
1799 		sd->completion_queue = skb;
1800 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1801 		local_irq_restore(flags);
1802 	}
1803 }
1804 EXPORT_SYMBOL(dev_kfree_skb_irq);
1805 
1806 void dev_kfree_skb_any(struct sk_buff *skb)
1807 {
1808 	if (in_irq() || irqs_disabled())
1809 		dev_kfree_skb_irq(skb);
1810 	else
1811 		dev_kfree_skb(skb);
1812 }
1813 EXPORT_SYMBOL(dev_kfree_skb_any);
1814 
1815 
1816 /**
1817  * netif_device_detach - mark device as removed
1818  * @dev: network device
1819  *
 1820  * Mark device as removed from the system and therefore no longer available.
1821  */
1822 void netif_device_detach(struct net_device *dev)
1823 {
1824 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1825 	    netif_running(dev)) {
1826 		netif_tx_stop_all_queues(dev);
1827 	}
1828 }
1829 EXPORT_SYMBOL(netif_device_detach);
1830 
1831 /**
1832  * netif_device_attach - mark device as attached
1833  * @dev: network device
1834  *
 1835  * Mark device as attached to the system and restart if needed.
1836  */
1837 void netif_device_attach(struct net_device *dev)
1838 {
1839 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1840 	    netif_running(dev)) {
1841 		netif_tx_wake_all_queues(dev);
1842 		__netdev_watchdog_up(dev);
1843 	}
1844 }
1845 EXPORT_SYMBOL(netif_device_attach);
1846 
1847 /**
1848  * skb_dev_set -- assign a new device to a buffer
1849  * @skb: buffer for the new device
1850  * @dev: network device
1851  *
1852  * If an skb is owned by a device already, we have to reset
1853  * all data private to the namespace a device belongs to
1854  * before assigning it a new device.
1855  */
1856 #ifdef CONFIG_NET_NS
1857 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1858 {
1859 	skb_dst_drop(skb);
1860 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1861 		secpath_reset(skb);
1862 		nf_reset(skb);
1863 		skb_init_secmark(skb);
1864 		skb->mark = 0;
1865 		skb->priority = 0;
1866 		skb->nf_trace = 0;
1867 		skb->ipvs_property = 0;
1868 #ifdef CONFIG_NET_SCHED
1869 		skb->tc_index = 0;
1870 #endif
1871 	}
1872 	skb->dev = dev;
1873 }
1874 EXPORT_SYMBOL(skb_set_dev);
1875 #endif /* CONFIG_NET_NS */
1876 
1877 /*
1878  * Invalidate hardware checksum when packet is to be mangled, and
1879  * complete checksum manually on outgoing path.
1880  */
1881 int skb_checksum_help(struct sk_buff *skb)
1882 {
1883 	__wsum csum;
1884 	int ret = 0, offset;
1885 
1886 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1887 		goto out_set_summed;
1888 
1889 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1890 		/* Let GSO fix up the checksum. */
1891 		goto out_set_summed;
1892 	}
1893 
1894 	offset = skb_checksum_start_offset(skb);
1895 	BUG_ON(offset >= skb_headlen(skb));
1896 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1897 
1898 	offset += skb->csum_offset;
1899 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1900 
1901 	if (skb_cloned(skb) &&
1902 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1903 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1904 		if (ret)
1905 			goto out;
1906 	}
1907 
1908 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1909 out_set_summed:
1910 	skb->ip_summed = CHECKSUM_NONE;
1911 out:
1912 	return ret;
1913 }
1914 EXPORT_SYMBOL(skb_checksum_help);
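
/*
 * Usage sketch (illustrative only, not compiled here): a path that is about
 * to modify packet data, or that hands a CHECKSUM_PARTIAL skb to a device
 * without checksum offload, resolves the checksum in software first.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    skb_checksum_help(skb))
 *		goto drop;	(hypothetical error label)
 */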
1915 
1916 /**
1917  *	skb_gso_segment - Perform segmentation on skb.
1918  *	@skb: buffer to segment
1919  *	@features: features for the output path (see dev->features)
1920  *
1921  *	This function segments the given skb and returns a list of segments.
1922  *
1923  *	It may return NULL if the skb requires no segmentation.  This is
1924  *	only possible when GSO is used for verifying header integrity.
1925  */
1926 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1927 {
1928 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1929 	struct packet_type *ptype;
1930 	__be16 type = skb->protocol;
1931 	int vlan_depth = ETH_HLEN;
1932 	int err;
1933 
1934 	while (type == htons(ETH_P_8021Q)) {
1935 		struct vlan_hdr *vh;
1936 
1937 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1938 			return ERR_PTR(-EINVAL);
1939 
1940 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1941 		type = vh->h_vlan_encapsulated_proto;
1942 		vlan_depth += VLAN_HLEN;
1943 	}
1944 
1945 	skb_reset_mac_header(skb);
1946 	skb->mac_len = skb->network_header - skb->mac_header;
1947 	__skb_pull(skb, skb->mac_len);
1948 
1949 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1950 		struct net_device *dev = skb->dev;
1951 		struct ethtool_drvinfo info = {};
1952 
1953 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1954 			dev->ethtool_ops->get_drvinfo(dev, &info);
1955 
1956 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1957 		     info.driver, dev ? dev->features : 0L,
1958 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1959 		     skb->len, skb->data_len, skb->ip_summed);
1960 
1961 		if (skb_header_cloned(skb) &&
1962 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1963 			return ERR_PTR(err);
1964 	}
1965 
1966 	rcu_read_lock();
1967 	list_for_each_entry_rcu(ptype,
1968 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1969 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1970 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1971 				err = ptype->gso_send_check(skb);
1972 				segs = ERR_PTR(err);
1973 				if (err || skb_gso_ok(skb, features))
1974 					break;
1975 				__skb_push(skb, (skb->data -
1976 						 skb_network_header(skb)));
1977 			}
1978 			segs = ptype->gso_segment(skb, features);
1979 			break;
1980 		}
1981 	}
1982 	rcu_read_unlock();
1983 
1984 	__skb_push(skb, skb->data - skb_mac_header(skb));
1985 
1986 	return segs;
1987 }
1988 EXPORT_SYMBOL(skb_gso_segment);
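
/*
 * Usage sketch (illustrative only, not compiled here): a caller that wants
 * to emit the segments itself walks the returned list.  mydrv_xmit_one()
 * is a hypothetical per-segment transmit helper.
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (!segs)
 *		return mydrv_xmit_one(skb);	(header verification only)
 *	consume_skb(skb);
 *	while (segs) {
 *		struct sk_buff *next = segs->next;
 *
 *		segs->next = NULL;
 *		mydrv_xmit_one(segs);
 *		segs = next;
 *	}
 */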
1989 
1990 /* Take action when hardware reception checksum errors are detected. */
1991 #ifdef CONFIG_BUG
1992 void netdev_rx_csum_fault(struct net_device *dev)
1993 {
1994 	if (net_ratelimit()) {
1995 		printk(KERN_ERR "%s: hw csum failure.\n",
1996 			dev ? dev->name : "<unknown>");
1997 		dump_stack();
1998 	}
1999 }
2000 EXPORT_SYMBOL(netdev_rx_csum_fault);
2001 #endif
2002 
2003 /* Actually, we should eliminate this check as soon as we know that:
2004  * 1. An IOMMU is present and can map all of the memory.
2005  * 2. No high memory really exists on this machine.
2006  */
2007 
2008 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2009 {
2010 #ifdef CONFIG_HIGHMEM
2011 	int i;
2012 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2013 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2014 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2015 			if (PageHighMem(skb_frag_page(frag)))
2016 				return 1;
2017 		}
2018 	}
2019 
2020 	if (PCI_DMA_BUS_IS_PHYS) {
2021 		struct device *pdev = dev->dev.parent;
2022 
2023 		if (!pdev)
2024 			return 0;
2025 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2026 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2027 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2028 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2029 				return 1;
2030 		}
2031 	}
2032 #endif
2033 	return 0;
2034 }
2035 
2036 struct dev_gso_cb {
2037 	void (*destructor)(struct sk_buff *skb);
2038 };
2039 
2040 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2041 
2042 static void dev_gso_skb_destructor(struct sk_buff *skb)
2043 {
2044 	struct dev_gso_cb *cb;
2045 
2046 	do {
2047 		struct sk_buff *nskb = skb->next;
2048 
2049 		skb->next = nskb->next;
2050 		nskb->next = NULL;
2051 		kfree_skb(nskb);
2052 	} while (skb->next);
2053 
2054 	cb = DEV_GSO_CB(skb);
2055 	if (cb->destructor)
2056 		cb->destructor(skb);
2057 }
2058 
2059 /**
2060  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2061  *	@skb: buffer to segment
2062  *	@features: device features as applicable to this skb
2063  *
2064  *	This function segments the given skb and stores the list of segments
2065  *	in skb->next.
2066  */
2067 static int dev_gso_segment(struct sk_buff *skb, int features)
2068 {
2069 	struct sk_buff *segs;
2070 
2071 	segs = skb_gso_segment(skb, features);
2072 
2073 	/* Verifying header integrity only. */
2074 	if (!segs)
2075 		return 0;
2076 
2077 	if (IS_ERR(segs))
2078 		return PTR_ERR(segs);
2079 
2080 	skb->next = segs;
2081 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2082 	skb->destructor = dev_gso_skb_destructor;
2083 
2084 	return 0;
2085 }
2086 
2087 /*
2088  * Try to orphan skb early, right before transmission by the device.
2089  * We cannot orphan the skb if a tx timestamp is requested or the sk
2090  * reference is needed at the driver level for other reasons, e.g. see net/can/raw.c
2091  */
2092 static inline void skb_orphan_try(struct sk_buff *skb)
2093 {
2094 	struct sock *sk = skb->sk;
2095 
2096 	if (sk && !skb_shinfo(skb)->tx_flags) {
2097 		/* skb_tx_hash() won't be able to get the sk,
2098 		 * so we copy sk_hash into skb->rxhash
2099 		 */
2100 		if (!skb->rxhash)
2101 			skb->rxhash = sk->sk_hash;
2102 		skb_orphan(skb);
2103 	}
2104 }
2105 
2106 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2107 {
2108 	return ((features & NETIF_F_GEN_CSUM) ||
2109 		((features & NETIF_F_V4_CSUM) &&
2110 		 protocol == htons(ETH_P_IP)) ||
2111 		((features & NETIF_F_V6_CSUM) &&
2112 		 protocol == htons(ETH_P_IPV6)) ||
2113 		((features & NETIF_F_FCOE_CRC) &&
2114 		 protocol == htons(ETH_P_FCOE)));
2115 }
2116 
2117 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2118 {
2119 	if (!can_checksum_protocol(features, protocol)) {
2120 		features &= ~NETIF_F_ALL_CSUM;
2121 		features &= ~NETIF_F_SG;
2122 	} else if (illegal_highdma(skb->dev, skb)) {
2123 		features &= ~NETIF_F_SG;
2124 	}
2125 
2126 	return features;
2127 }
2128 
2129 u32 netif_skb_features(struct sk_buff *skb)
2130 {
2131 	__be16 protocol = skb->protocol;
2132 	u32 features = skb->dev->features;
2133 
2134 	if (protocol == htons(ETH_P_8021Q)) {
2135 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2136 		protocol = veh->h_vlan_encapsulated_proto;
2137 	} else if (!vlan_tx_tag_present(skb)) {
2138 		return harmonize_features(skb, protocol, features);
2139 	}
2140 
2141 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2142 
2143 	if (protocol != htons(ETH_P_8021Q)) {
2144 		return harmonize_features(skb, protocol, features);
2145 	} else {
2146 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2147 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2148 		return harmonize_features(skb, protocol, features);
2149 	}
2150 }
2151 EXPORT_SYMBOL(netif_skb_features);
2152 
2153 /*
2154  * Returns true if either:
2155  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2156  *	2. skb is fragmented and the device does not support SG, or if
2157  *	   at least one of the fragments is in highmem and the device does
2158  *	   not support DMA from it.
2159  */
2160 static inline int skb_needs_linearize(struct sk_buff *skb,
2161 				      int features)
2162 {
2163 	return skb_is_nonlinear(skb) &&
2164 			((skb_has_frag_list(skb) &&
2165 				!(features & NETIF_F_FRAGLIST)) ||
2166 			(skb_shinfo(skb)->nr_frags &&
2167 				!(features & NETIF_F_SG)));
2168 }
2169 
2170 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2171 			struct netdev_queue *txq)
2172 {
2173 	const struct net_device_ops *ops = dev->netdev_ops;
2174 	int rc = NETDEV_TX_OK;
2175 	unsigned int skb_len;
2176 
2177 	if (likely(!skb->next)) {
2178 		u32 features;
2179 
2180 		/*
2181 		 * If the device doesn't need skb->dst, release it right now while
2182 		 * it's hot in this CPU's cache
2183 		 */
2184 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2185 			skb_dst_drop(skb);
2186 
2187 		if (!list_empty(&ptype_all))
2188 			dev_queue_xmit_nit(skb, dev);
2189 
2190 		skb_orphan_try(skb);
2191 
2192 		features = netif_skb_features(skb);
2193 
2194 		if (vlan_tx_tag_present(skb) &&
2195 		    !(features & NETIF_F_HW_VLAN_TX)) {
2196 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2197 			if (unlikely(!skb))
2198 				goto out;
2199 
2200 			skb->vlan_tci = 0;
2201 		}
2202 
2203 		if (netif_needs_gso(skb, features)) {
2204 			if (unlikely(dev_gso_segment(skb, features)))
2205 				goto out_kfree_skb;
2206 			if (skb->next)
2207 				goto gso;
2208 		} else {
2209 			if (skb_needs_linearize(skb, features) &&
2210 			    __skb_linearize(skb))
2211 				goto out_kfree_skb;
2212 
2213 			/* If packet is not checksummed and device does not
2214 			 * support checksumming for this protocol, complete
2215 			 * checksumming here.
2216 			 */
2217 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2218 				skb_set_transport_header(skb,
2219 					skb_checksum_start_offset(skb));
2220 				if (!(features & NETIF_F_ALL_CSUM) &&
2221 				     skb_checksum_help(skb))
2222 					goto out_kfree_skb;
2223 			}
2224 		}
2225 
2226 		skb_len = skb->len;
2227 		rc = ops->ndo_start_xmit(skb, dev);
2228 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2229 		if (rc == NETDEV_TX_OK)
2230 			txq_trans_update(txq);
2231 		return rc;
2232 	}
2233 
2234 gso:
2235 	do {
2236 		struct sk_buff *nskb = skb->next;
2237 
2238 		skb->next = nskb->next;
2239 		nskb->next = NULL;
2240 
2241 		/*
2242 		 * If the device doesn't need nskb->dst, release it right now while
2243 		 * it's hot in this CPU's cache
2244 		 */
2245 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2246 			skb_dst_drop(nskb);
2247 
2248 		skb_len = nskb->len;
2249 		rc = ops->ndo_start_xmit(nskb, dev);
2250 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2251 		if (unlikely(rc != NETDEV_TX_OK)) {
2252 			if (rc & ~NETDEV_TX_MASK)
2253 				goto out_kfree_gso_skb;
2254 			nskb->next = skb->next;
2255 			skb->next = nskb;
2256 			return rc;
2257 		}
2258 		txq_trans_update(txq);
2259 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2260 			return NETDEV_TX_BUSY;
2261 	} while (skb->next);
2262 
2263 out_kfree_gso_skb:
2264 	if (likely(skb->next == NULL))
2265 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2266 out_kfree_skb:
2267 	kfree_skb(skb);
2268 out:
2269 	return rc;
2270 }
2271 
2272 static u32 hashrnd __read_mostly;
2273 
2274 /*
2275  * Returns a Tx hash based on the given packet descriptor and the number of
2276  * Tx queues to be used as a distribution range.
2277  */
2278 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2279 		  unsigned int num_tx_queues)
2280 {
2281 	u32 hash;
2282 	u16 qoffset = 0;
2283 	u16 qcount = num_tx_queues;
2284 
2285 	if (skb_rx_queue_recorded(skb)) {
2286 		hash = skb_get_rx_queue(skb);
2287 		while (unlikely(hash >= num_tx_queues))
2288 			hash -= num_tx_queues;
2289 		return hash;
2290 	}
2291 
2292 	if (dev->num_tc) {
2293 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2294 		qoffset = dev->tc_to_txq[tc].offset;
2295 		qcount = dev->tc_to_txq[tc].count;
2296 	}
2297 
2298 	if (skb->sk && skb->sk->sk_hash)
2299 		hash = skb->sk->sk_hash;
2300 	else
2301 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2302 	hash = jhash_1word(hash, hashrnd);
2303 
2304 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2305 }
2306 EXPORT_SYMBOL(__skb_tx_hash);
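
/*
 * Note on the final scaling above (illustrative): for a 32-bit hash h and
 * qcount queues, ((u64)h * qcount) >> 32 maps h onto [0, qcount) without a
 * modulo operation.  For example, with qcount = 8 the expression is
 * equivalent to h >> 29, so each of the 8 queues is selected with (nearly)
 * equal probability.
 */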
2307 
2308 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2309 {
2310 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2311 		if (net_ratelimit()) {
2312 			pr_warning("%s selects TX queue %d, but "
2313 				"real number of TX queues is %d\n",
2314 				dev->name, queue_index, dev->real_num_tx_queues);
2315 		}
2316 		return 0;
2317 	}
2318 	return queue_index;
2319 }
2320 
2321 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2322 {
2323 #ifdef CONFIG_XPS
2324 	struct xps_dev_maps *dev_maps;
2325 	struct xps_map *map;
2326 	int queue_index = -1;
2327 
2328 	rcu_read_lock();
2329 	dev_maps = rcu_dereference(dev->xps_maps);
2330 	if (dev_maps) {
2331 		map = rcu_dereference(
2332 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2333 		if (map) {
2334 			if (map->len == 1)
2335 				queue_index = map->queues[0];
2336 			else {
2337 				u32 hash;
2338 				if (skb->sk && skb->sk->sk_hash)
2339 					hash = skb->sk->sk_hash;
2340 				else
2341 					hash = (__force u16) skb->protocol ^
2342 					    skb->rxhash;
2343 				hash = jhash_1word(hash, hashrnd);
2344 				queue_index = map->queues[
2345 				    ((u64)hash * map->len) >> 32];
2346 			}
2347 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2348 				queue_index = -1;
2349 		}
2350 	}
2351 	rcu_read_unlock();
2352 
2353 	return queue_index;
2354 #else
2355 	return -1;
2356 #endif
2357 }
2358 
2359 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2360 					struct sk_buff *skb)
2361 {
2362 	int queue_index;
2363 	const struct net_device_ops *ops = dev->netdev_ops;
2364 
2365 	if (dev->real_num_tx_queues == 1)
2366 		queue_index = 0;
2367 	else if (ops->ndo_select_queue) {
2368 		queue_index = ops->ndo_select_queue(dev, skb);
2369 		queue_index = dev_cap_txqueue(dev, queue_index);
2370 	} else {
2371 		struct sock *sk = skb->sk;
2372 		queue_index = sk_tx_queue_get(sk);
2373 
2374 		if (queue_index < 0 || skb->ooo_okay ||
2375 		    queue_index >= dev->real_num_tx_queues) {
2376 			int old_index = queue_index;
2377 
2378 			queue_index = get_xps_queue(dev, skb);
2379 			if (queue_index < 0)
2380 				queue_index = skb_tx_hash(dev, skb);
2381 
2382 			if (queue_index != old_index && sk) {
2383 				struct dst_entry *dst =
2384 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2385 
2386 				if (dst && skb_dst(skb) == dst)
2387 					sk_tx_queue_set(sk, queue_index);
2388 			}
2389 		}
2390 	}
2391 
2392 	skb_set_queue_mapping(skb, queue_index);
2393 	return netdev_get_tx_queue(dev, queue_index);
2394 }
2395 
2396 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2397 				 struct net_device *dev,
2398 				 struct netdev_queue *txq)
2399 {
2400 	spinlock_t *root_lock = qdisc_lock(q);
2401 	bool contended;
2402 	int rc;
2403 
2404 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2405 	qdisc_calculate_pkt_len(skb, q);
2406 	/*
2407 	 * Heuristic to force contended enqueues to serialize on a
2408 	 * separate lock before trying to get the qdisc main lock.
2409 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2410 	 * and dequeue packets faster.
2411 	 */
2412 	contended = qdisc_is_running(q);
2413 	if (unlikely(contended))
2414 		spin_lock(&q->busylock);
2415 
2416 	spin_lock(root_lock);
2417 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2418 		kfree_skb(skb);
2419 		rc = NET_XMIT_DROP;
2420 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2421 		   qdisc_run_begin(q)) {
2422 		/*
2423 		 * This is a work-conserving queue; there are no old skbs
2424 		 * waiting to be sent out; and the qdisc is not running -
2425 		 * xmit the skb directly.
2426 		 */
2427 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2428 			skb_dst_force(skb);
2429 
2430 		qdisc_bstats_update(q, skb);
2431 
2432 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2433 			if (unlikely(contended)) {
2434 				spin_unlock(&q->busylock);
2435 				contended = false;
2436 			}
2437 			__qdisc_run(q);
2438 		} else
2439 			qdisc_run_end(q);
2440 
2441 		rc = NET_XMIT_SUCCESS;
2442 	} else {
2443 		skb_dst_force(skb);
2444 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2445 		if (qdisc_run_begin(q)) {
2446 			if (unlikely(contended)) {
2447 				spin_unlock(&q->busylock);
2448 				contended = false;
2449 			}
2450 			__qdisc_run(q);
2451 		}
2452 	}
2453 	spin_unlock(root_lock);
2454 	if (unlikely(contended))
2455 		spin_unlock(&q->busylock);
2456 	return rc;
2457 }
2458 
2459 static DEFINE_PER_CPU(int, xmit_recursion);
2460 #define RECURSION_LIMIT 10
2461 
2462 /**
2463  *	dev_queue_xmit - transmit a buffer
2464  *	@skb: buffer to transmit
2465  *
2466  *	Queue a buffer for transmission to a network device. The caller must
2467  *	have set the device and priority and built the buffer before calling
2468  *	this function. The function can be called from an interrupt.
2469  *
2470  *	A negative errno code is returned on a failure. A success does not
2471  *	guarantee the frame will be transmitted as it may be dropped due
2472  *	to congestion or traffic shaping.
2473  *
2474  * -----------------------------------------------------------------------------------
2475  *      I notice this method can also return errors from the queue disciplines,
2476  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2477  *      be positive.
2478  *
2479  *      Regardless of the return value, the skb is consumed, so it is currently
2480  *      difficult to retry a send to this method.  (You can bump the ref count
2481  *      before sending to hold a reference for retry if you are careful.)
2482  *
2483  *      When calling this method, interrupts MUST be enabled.  This is because
2484  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2485  *          --BLG
2486  */
2487 int dev_queue_xmit(struct sk_buff *skb)
2488 {
2489 	struct net_device *dev = skb->dev;
2490 	struct netdev_queue *txq;
2491 	struct Qdisc *q;
2492 	int rc = -ENOMEM;
2493 
2494 	/* Disable soft irqs for various locks below. Also
2495 	 * stops preemption for RCU.
2496 	 */
2497 	rcu_read_lock_bh();
2498 
2499 	txq = dev_pick_tx(dev, skb);
2500 	q = rcu_dereference_bh(txq->qdisc);
2501 
2502 #ifdef CONFIG_NET_CLS_ACT
2503 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2504 #endif
2505 	trace_net_dev_queue(skb);
2506 	if (q->enqueue) {
2507 		rc = __dev_xmit_skb(skb, q, dev, txq);
2508 		goto out;
2509 	}
2510 
2511 	/* The device has no queue. Common case for software devices:
2512 	   loopback, all sorts of tunnels...
2513 
2514 	   Really, it is unlikely that netif_tx_lock protection is necessary
2515 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2516 	   counters.)
2517 	   However, it is possible that they rely on the protection
2518 	   we provide here.
2519 
2520 	   Check this and take the lock. It is not prone to deadlocks.
2521 	   Or shoot the noqueue qdisc, which is even simpler 8)
2522 	 */
2523 	if (dev->flags & IFF_UP) {
2524 		int cpu = smp_processor_id(); /* ok because BHs are off */
2525 
2526 		if (txq->xmit_lock_owner != cpu) {
2527 
2528 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2529 				goto recursion_alert;
2530 
2531 			HARD_TX_LOCK(dev, txq, cpu);
2532 
2533 			if (!netif_tx_queue_stopped(txq)) {
2534 				__this_cpu_inc(xmit_recursion);
2535 				rc = dev_hard_start_xmit(skb, dev, txq);
2536 				__this_cpu_dec(xmit_recursion);
2537 				if (dev_xmit_complete(rc)) {
2538 					HARD_TX_UNLOCK(dev, txq);
2539 					goto out;
2540 				}
2541 			}
2542 			HARD_TX_UNLOCK(dev, txq);
2543 			if (net_ratelimit())
2544 				printk(KERN_CRIT "Virtual device %s asks to "
2545 				       "queue packet!\n", dev->name);
2546 		} else {
2547 			/* Recursion is detected! It is possible,
2548 			 * unfortunately
2549 			 */
2550 recursion_alert:
2551 			if (net_ratelimit())
2552 				printk(KERN_CRIT "Dead loop on virtual device "
2553 				       "%s, fix it urgently!\n", dev->name);
2554 		}
2555 	}
2556 
2557 	rc = -ENETDOWN;
2558 	rcu_read_unlock_bh();
2559 
2560 	kfree_skb(skb);
2561 	return rc;
2562 out:
2563 	rcu_read_unlock_bh();
2564 	return rc;
2565 }
2566 EXPORT_SYMBOL(dev_queue_xmit);
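
/*
 * Usage sketch (illustrative only, not compiled here): a caller that has
 * already built the link-layer header just sets the output device and
 * hands the buffer off; the skb is consumed whatever the return value is.
 * The stats pointer is hypothetical.
 *
 *	skb->dev = dev;
 *	rc = dev_queue_xmit(skb);
 *	if (rc < 0 || rc == NET_XMIT_DROP)
 *		stats->tx_dropped++;	(do not touch skb here, it is gone)
 */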
2567 
2568 
2569 /*=======================================================================
2570 			Receiver routines
2571   =======================================================================*/
2572 
2573 int netdev_max_backlog __read_mostly = 1000;
2574 int netdev_tstamp_prequeue __read_mostly = 1;
2575 int netdev_budget __read_mostly = 300;
2576 int weight_p __read_mostly = 64;            /* old backlog weight */
2577 
2578 /* Called with irq disabled */
2579 static inline void ____napi_schedule(struct softnet_data *sd,
2580 				     struct napi_struct *napi)
2581 {
2582 	list_add_tail(&napi->poll_list, &sd->poll_list);
2583 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2584 }
2585 
2586 /*
2587  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2588  * and src/dst port numbers.  Sets rxhash in the skb to a non-zero hash value
2589  * on success; zero indicates no valid hash.  Also sets l4_rxhash in the skb
2590  * if the hash is a canonical 4-tuple hash over transport ports.
2591  */
2592 void __skb_get_rxhash(struct sk_buff *skb)
2593 {
2594 	int nhoff, hash = 0, poff;
2595 	const struct ipv6hdr *ip6;
2596 	const struct iphdr *ip;
2597 	const struct vlan_hdr *vlan;
2598 	u8 ip_proto;
2599 	u32 addr1, addr2;
2600 	u16 proto;
2601 	union {
2602 		u32 v32;
2603 		u16 v16[2];
2604 	} ports;
2605 
2606 	nhoff = skb_network_offset(skb);
2607 	proto = skb->protocol;
2608 
2609 again:
2610 	switch (proto) {
2611 	case __constant_htons(ETH_P_IP):
2612 ip:
2613 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2614 			goto done;
2615 
2616 		ip = (const struct iphdr *) (skb->data + nhoff);
2617 		if (ip_is_fragment(ip))
2618 			ip_proto = 0;
2619 		else
2620 			ip_proto = ip->protocol;
2621 		addr1 = (__force u32) ip->saddr;
2622 		addr2 = (__force u32) ip->daddr;
2623 		nhoff += ip->ihl * 4;
2624 		break;
2625 	case __constant_htons(ETH_P_IPV6):
2626 ipv6:
2627 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2628 			goto done;
2629 
2630 		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2631 		ip_proto = ip6->nexthdr;
2632 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2633 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2634 		nhoff += 40;
2635 		break;
2636 	case __constant_htons(ETH_P_8021Q):
2637 		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2638 			goto done;
2639 		vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2640 		proto = vlan->h_vlan_encapsulated_proto;
2641 		nhoff += sizeof(*vlan);
2642 		goto again;
2643 	case __constant_htons(ETH_P_PPP_SES):
2644 		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2645 			goto done;
2646 		proto = *((__be16 *) (skb->data + nhoff +
2647 				      sizeof(struct pppoe_hdr)));
2648 		nhoff += PPPOE_SES_HLEN;
2649 		switch (proto) {
2650 		case __constant_htons(PPP_IP):
2651 			goto ip;
2652 		case __constant_htons(PPP_IPV6):
2653 			goto ipv6;
2654 		default:
2655 			goto done;
2656 		}
2657 	default:
2658 		goto done;
2659 	}
2660 
2661 	switch (ip_proto) {
2662 	case IPPROTO_GRE:
2663 		if (pskb_may_pull(skb, nhoff + 16)) {
2664 			u8 *h = skb->data + nhoff;
2665 			__be16 flags = *(__be16 *)h;
2666 
2667 			/*
2668 			 * Only look inside GRE if version zero and no
2669 			 * routing
2670 			 */
2671 			if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2672 				proto = *(__be16 *)(h + 2);
2673 				nhoff += 4;
2674 				if (flags & GRE_CSUM)
2675 					nhoff += 4;
2676 				if (flags & GRE_KEY)
2677 					nhoff += 4;
2678 				if (flags & GRE_SEQ)
2679 					nhoff += 4;
2680 				goto again;
2681 			}
2682 		}
2683 		break;
2684 	case IPPROTO_IPIP:
2685 		goto again;
2686 	default:
2687 		break;
2688 	}
2689 
2690 	ports.v32 = 0;
2691 	poff = proto_ports_offset(ip_proto);
2692 	if (poff >= 0) {
2693 		nhoff += poff;
2694 		if (pskb_may_pull(skb, nhoff + 4)) {
2695 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2696 			if (ports.v16[1] < ports.v16[0])
2697 				swap(ports.v16[0], ports.v16[1]);
2698 			skb->l4_rxhash = 1;
2699 		}
2700 	}
2701 
2702 	/* get a consistent hash (same value on both flow directions) */
2703 	if (addr2 < addr1)
2704 		swap(addr1, addr2);
2705 
2706 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2707 	if (!hash)
2708 		hash = 1;
2709 
2710 done:
2711 	skb->rxhash = hash;
2712 }
2713 EXPORT_SYMBOL(__skb_get_rxhash);
2714 
2715 #ifdef CONFIG_RPS
2716 
2717 /* One global table that all flow-based protocols share. */
2718 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2719 EXPORT_SYMBOL(rps_sock_flow_table);
2720 
2721 static struct rps_dev_flow *
2722 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2723 	    struct rps_dev_flow *rflow, u16 next_cpu)
2724 {
2725 	if (next_cpu != RPS_NO_CPU) {
2726 #ifdef CONFIG_RFS_ACCEL
2727 		struct netdev_rx_queue *rxqueue;
2728 		struct rps_dev_flow_table *flow_table;
2729 		struct rps_dev_flow *old_rflow;
2730 		u32 flow_id;
2731 		u16 rxq_index;
2732 		int rc;
2733 
2734 		/* Should we steer this flow to a different hardware queue? */
2735 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2736 		    !(dev->features & NETIF_F_NTUPLE))
2737 			goto out;
2738 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2739 		if (rxq_index == skb_get_rx_queue(skb))
2740 			goto out;
2741 
2742 		rxqueue = dev->_rx + rxq_index;
2743 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2744 		if (!flow_table)
2745 			goto out;
2746 		flow_id = skb->rxhash & flow_table->mask;
2747 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2748 							rxq_index, flow_id);
2749 		if (rc < 0)
2750 			goto out;
2751 		old_rflow = rflow;
2752 		rflow = &flow_table->flows[flow_id];
2753 		rflow->filter = rc;
2754 		if (old_rflow->filter == rflow->filter)
2755 			old_rflow->filter = RPS_NO_FILTER;
2756 	out:
2757 #endif
2758 		rflow->last_qtail =
2759 			per_cpu(softnet_data, next_cpu).input_queue_head;
2760 	}
2761 
2762 	rflow->cpu = next_cpu;
2763 	return rflow;
2764 }
2765 
2766 /*
2767  * get_rps_cpu is called from netif_receive_skb and returns the target
2768  * CPU from the RPS map of the receiving queue for a given skb.
2769  * rcu_read_lock must be held on entry.
2770  */
2771 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2772 		       struct rps_dev_flow **rflowp)
2773 {
2774 	struct netdev_rx_queue *rxqueue;
2775 	struct rps_map *map;
2776 	struct rps_dev_flow_table *flow_table;
2777 	struct rps_sock_flow_table *sock_flow_table;
2778 	int cpu = -1;
2779 	u16 tcpu;
2780 
2781 	if (skb_rx_queue_recorded(skb)) {
2782 		u16 index = skb_get_rx_queue(skb);
2783 		if (unlikely(index >= dev->real_num_rx_queues)) {
2784 			WARN_ONCE(dev->real_num_rx_queues > 1,
2785 				  "%s received packet on queue %u, but number "
2786 				  "of RX queues is %u\n",
2787 				  dev->name, index, dev->real_num_rx_queues);
2788 			goto done;
2789 		}
2790 		rxqueue = dev->_rx + index;
2791 	} else
2792 		rxqueue = dev->_rx;
2793 
2794 	map = rcu_dereference(rxqueue->rps_map);
2795 	if (map) {
2796 		if (map->len == 1 &&
2797 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2798 			tcpu = map->cpus[0];
2799 			if (cpu_online(tcpu))
2800 				cpu = tcpu;
2801 			goto done;
2802 		}
2803 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2804 		goto done;
2805 	}
2806 
2807 	skb_reset_network_header(skb);
2808 	if (!skb_get_rxhash(skb))
2809 		goto done;
2810 
2811 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2812 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2813 	if (flow_table && sock_flow_table) {
2814 		u16 next_cpu;
2815 		struct rps_dev_flow *rflow;
2816 
2817 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2818 		tcpu = rflow->cpu;
2819 
2820 		next_cpu = sock_flow_table->ents[skb->rxhash &
2821 		    sock_flow_table->mask];
2822 
2823 		/*
2824 		 * If the desired CPU (where last recvmsg was done) is
2825 		 * different from current CPU (one in the rx-queue flow
2826 		 * table entry), switch if one of the following holds:
2827 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2828 		 *   - Current CPU is offline.
2829 		 *   - The current CPU's queue tail has advanced beyond the
2830 		 *     last packet that was enqueued using this table entry.
2831 		 *     This guarantees that all previous packets for the flow
2832 		 *     have been dequeued, thus preserving in order delivery.
2833 		 */
2834 		if (unlikely(tcpu != next_cpu) &&
2835 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2836 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2837 		      rflow->last_qtail)) >= 0))
2838 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2839 
2840 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2841 			*rflowp = rflow;
2842 			cpu = tcpu;
2843 			goto done;
2844 		}
2845 	}
2846 
2847 	if (map) {
2848 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2849 
2850 		if (cpu_online(tcpu)) {
2851 			cpu = tcpu;
2852 			goto done;
2853 		}
2854 	}
2855 
2856 done:
2857 	return cpu;
2858 }
2859 
2860 #ifdef CONFIG_RFS_ACCEL
2861 
2862 /**
2863  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2864  * @dev: Device on which the filter was set
2865  * @rxq_index: RX queue index
2866  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2867  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2868  *
2869  * Drivers that implement ndo_rx_flow_steer() should periodically call
2870  * this function for each installed filter and remove the filters for
2871  * which it returns %true.
2872  */
2873 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2874 			 u32 flow_id, u16 filter_id)
2875 {
2876 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2877 	struct rps_dev_flow_table *flow_table;
2878 	struct rps_dev_flow *rflow;
2879 	bool expire = true;
2880 	int cpu;
2881 
2882 	rcu_read_lock();
2883 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2884 	if (flow_table && flow_id <= flow_table->mask) {
2885 		rflow = &flow_table->flows[flow_id];
2886 		cpu = ACCESS_ONCE(rflow->cpu);
2887 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2888 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2889 			   rflow->last_qtail) <
2890 		     (int)(10 * flow_table->mask)))
2891 			expire = false;
2892 	}
2893 	rcu_read_unlock();
2894 	return expire;
2895 }
2896 EXPORT_SYMBOL(rps_may_expire_flow);
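
/*
 * Usage sketch (illustrative only, not compiled here): a driver with
 * hardware flow steering periodically scans its installed filters and
 * removes the ones the stack no longer cares about.  The mydrv_* names
 * and the filter table layout are hypothetical.
 *
 *	for (i = 0; i < priv->n_filters; i++) {
 *		struct mydrv_filter *f = &priv->filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i)) {
 *			mydrv_remove_hw_filter(priv, i);
 *			f->in_use = false;
 *		}
 *	}
 */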
2897 
2898 #endif /* CONFIG_RFS_ACCEL */
2899 
2900 /* Called from hardirq (IPI) context */
2901 static void rps_trigger_softirq(void *data)
2902 {
2903 	struct softnet_data *sd = data;
2904 
2905 	____napi_schedule(sd, &sd->backlog);
2906 	sd->received_rps++;
2907 }
2908 
2909 #endif /* CONFIG_RPS */
2910 
2911 /*
2912  * Check if this softnet_data structure belongs to another CPU.
2913  * If yes, queue it to our IPI list and return 1;
2914  * if no, return 0.
2915  */
2916 static int rps_ipi_queued(struct softnet_data *sd)
2917 {
2918 #ifdef CONFIG_RPS
2919 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2920 
2921 	if (sd != mysd) {
2922 		sd->rps_ipi_next = mysd->rps_ipi_list;
2923 		mysd->rps_ipi_list = sd;
2924 
2925 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2926 		return 1;
2927 	}
2928 #endif /* CONFIG_RPS */
2929 	return 0;
2930 }
2931 
2932 /*
2933  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2934  * queue (may be a remote CPU queue).
2935  */
2936 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2937 			      unsigned int *qtail)
2938 {
2939 	struct softnet_data *sd;
2940 	unsigned long flags;
2941 
2942 	sd = &per_cpu(softnet_data, cpu);
2943 
2944 	local_irq_save(flags);
2945 
2946 	rps_lock(sd);
2947 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2948 		if (skb_queue_len(&sd->input_pkt_queue)) {
2949 enqueue:
2950 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2951 			input_queue_tail_incr_save(sd, qtail);
2952 			rps_unlock(sd);
2953 			local_irq_restore(flags);
2954 			return NET_RX_SUCCESS;
2955 		}
2956 
2957 		/* Schedule NAPI for the backlog device.
2958 		 * We can use a non-atomic operation since we own the queue lock.
2959 		 */
2960 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2961 			if (!rps_ipi_queued(sd))
2962 				____napi_schedule(sd, &sd->backlog);
2963 		}
2964 		goto enqueue;
2965 	}
2966 
2967 	sd->dropped++;
2968 	rps_unlock(sd);
2969 
2970 	local_irq_restore(flags);
2971 
2972 	atomic_long_inc(&skb->dev->rx_dropped);
2973 	kfree_skb(skb);
2974 	return NET_RX_DROP;
2975 }
2976 
2977 /**
2978  *	netif_rx	-	post buffer to the network code
2979  *	@skb: buffer to post
2980  *
2981  *	This function receives a packet from a device driver and queues it for
2982  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2983  *	may be dropped during processing for congestion control or by the
2984  *	protocol layers.
2985  *
2986  *	return values:
2987  *	NET_RX_SUCCESS	(no congestion)
2988  *	NET_RX_DROP     (packet was dropped)
2989  *
2990  */
2991 
2992 int netif_rx(struct sk_buff *skb)
2993 {
2994 	int ret;
2995 
2996 	/* if netpoll wants it, pretend we never saw it */
2997 	if (netpoll_rx(skb))
2998 		return NET_RX_DROP;
2999 
3000 	if (netdev_tstamp_prequeue)
3001 		net_timestamp_check(skb);
3002 
3003 	trace_netif_rx(skb);
3004 #ifdef CONFIG_RPS
3005 	{
3006 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3007 		int cpu;
3008 
3009 		preempt_disable();
3010 		rcu_read_lock();
3011 
3012 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3013 		if (cpu < 0)
3014 			cpu = smp_processor_id();
3015 
3016 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3017 
3018 		rcu_read_unlock();
3019 		preempt_enable();
3020 	}
3021 #else
3022 	{
3023 		unsigned int qtail;
3024 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3025 		put_cpu();
3026 	}
3027 #endif
3028 	return ret;
3029 }
3030 EXPORT_SYMBOL(netif_rx);
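
/*
 * Usage sketch (illustrative only, not compiled here): a non-NAPI driver's
 * receive interrupt copies the frame into a fresh skb and posts it with
 * netif_rx().  hw_buf and len are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, len), hw_buf, len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */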
3031 
3032 int netif_rx_ni(struct sk_buff *skb)
3033 {
3034 	int err;
3035 
3036 	preempt_disable();
3037 	err = netif_rx(skb);
3038 	if (local_softirq_pending())
3039 		do_softirq();
3040 	preempt_enable();
3041 
3042 	return err;
3043 }
3044 EXPORT_SYMBOL(netif_rx_ni);
3045 
3046 static void net_tx_action(struct softirq_action *h)
3047 {
3048 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3049 
3050 	if (sd->completion_queue) {
3051 		struct sk_buff *clist;
3052 
3053 		local_irq_disable();
3054 		clist = sd->completion_queue;
3055 		sd->completion_queue = NULL;
3056 		local_irq_enable();
3057 
3058 		while (clist) {
3059 			struct sk_buff *skb = clist;
3060 			clist = clist->next;
3061 
3062 			WARN_ON(atomic_read(&skb->users));
3063 			trace_kfree_skb(skb, net_tx_action);
3064 			__kfree_skb(skb);
3065 		}
3066 	}
3067 
3068 	if (sd->output_queue) {
3069 		struct Qdisc *head;
3070 
3071 		local_irq_disable();
3072 		head = sd->output_queue;
3073 		sd->output_queue = NULL;
3074 		sd->output_queue_tailp = &sd->output_queue;
3075 		local_irq_enable();
3076 
3077 		while (head) {
3078 			struct Qdisc *q = head;
3079 			spinlock_t *root_lock;
3080 
3081 			head = head->next_sched;
3082 
3083 			root_lock = qdisc_lock(q);
3084 			if (spin_trylock(root_lock)) {
3085 				smp_mb__before_clear_bit();
3086 				clear_bit(__QDISC_STATE_SCHED,
3087 					  &q->state);
3088 				qdisc_run(q);
3089 				spin_unlock(root_lock);
3090 			} else {
3091 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3092 					      &q->state)) {
3093 					__netif_reschedule(q);
3094 				} else {
3095 					smp_mb__before_clear_bit();
3096 					clear_bit(__QDISC_STATE_SCHED,
3097 						  &q->state);
3098 				}
3099 			}
3100 		}
3101 	}
3102 }
3103 
3104 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3105     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3106 /* This hook is defined here for ATM LANE */
3107 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3108 			     unsigned char *addr) __read_mostly;
3109 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3110 #endif
3111 
3112 #ifdef CONFIG_NET_CLS_ACT
3113 /* TODO: Maybe we should just force sch_ingress to be compiled in
3114  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3115  * instructions (a compare and 2 extra stores) right now when ingress
3116  * is not configured but CONFIG_NET_CLS_ACT is enabled.
3117  * NOTE: This doesn't stop any functionality; if you don't have
3118  * the ingress scheduler, you just can't add policies on ingress.
3119  *
3120  */
3121 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3122 {
3123 	struct net_device *dev = skb->dev;
3124 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3125 	int result = TC_ACT_OK;
3126 	struct Qdisc *q;
3127 
3128 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3129 		if (net_ratelimit())
3130 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3131 			       skb->skb_iif, dev->ifindex);
3132 		return TC_ACT_SHOT;
3133 	}
3134 
3135 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3136 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3137 
3138 	q = rxq->qdisc;
3139 	if (q != &noop_qdisc) {
3140 		spin_lock(qdisc_lock(q));
3141 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3142 			result = qdisc_enqueue_root(skb, q);
3143 		spin_unlock(qdisc_lock(q));
3144 	}
3145 
3146 	return result;
3147 }
3148 
3149 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3150 					 struct packet_type **pt_prev,
3151 					 int *ret, struct net_device *orig_dev)
3152 {
3153 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3154 
3155 	if (!rxq || rxq->qdisc == &noop_qdisc)
3156 		goto out;
3157 
3158 	if (*pt_prev) {
3159 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3160 		*pt_prev = NULL;
3161 	}
3162 
3163 	switch (ing_filter(skb, rxq)) {
3164 	case TC_ACT_SHOT:
3165 	case TC_ACT_STOLEN:
3166 		kfree_skb(skb);
3167 		return NULL;
3168 	}
3169 
3170 out:
3171 	skb->tc_verd = 0;
3172 	return skb;
3173 }
3174 #endif
3175 
3176 /**
3177  *	netdev_rx_handler_register - register receive handler
3178  *	@dev: device to register a handler for
3179  *	@rx_handler: receive handler to register
3180  *	@rx_handler_data: data pointer that is used by rx handler
3181  *
3182  *	Register a receive handler for a device. This handler will then be
3183  *	called from __netif_receive_skb. A negative errno code is returned
3184  *	on a failure.
3185  *
3186  *	The caller must hold the rtnl_mutex.
3187  *
3188  *	For a general description of rx_handler, see enum rx_handler_result.
3189  */
3190 int netdev_rx_handler_register(struct net_device *dev,
3191 			       rx_handler_func_t *rx_handler,
3192 			       void *rx_handler_data)
3193 {
3194 	ASSERT_RTNL();
3195 
3196 	if (dev->rx_handler)
3197 		return -EBUSY;
3198 
3199 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3200 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3201 
3202 	return 0;
3203 }
3204 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
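
/*
 * Usage sketch (illustrative only, not compiled here): a bridge-like upper
 * device claims frames arriving on a slave by registering a handler under
 * RTNL.  The mydrv_* names are hypothetical.
 *
 *	static rx_handler_result_t mydrv_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *
 *		if (!mydrv_wants_frame(skb))
 *			return RX_HANDLER_PASS;
 *		mydrv_enqueue(skb);
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(slave_dev, mydrv_handle_frame, priv);
 *	rtnl_unlock();
 */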
3205 
3206 /**
3207  *	netdev_rx_handler_unregister - unregister receive handler
3208  *	@dev: device to unregister a handler from
3209  *
3210  *	Unregister a receive handler from a device.
3211  *
3212  *	The caller must hold the rtnl_mutex.
3213  */
3214 void netdev_rx_handler_unregister(struct net_device *dev)
3215 {
3216 
3217 	ASSERT_RTNL();
3218 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3219 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3220 }
3221 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3222 
3223 static int __netif_receive_skb(struct sk_buff *skb)
3224 {
3225 	struct packet_type *ptype, *pt_prev;
3226 	rx_handler_func_t *rx_handler;
3227 	struct net_device *orig_dev;
3228 	struct net_device *null_or_dev;
3229 	bool deliver_exact = false;
3230 	int ret = NET_RX_DROP;
3231 	__be16 type;
3232 
3233 	if (!netdev_tstamp_prequeue)
3234 		net_timestamp_check(skb);
3235 
3236 	trace_netif_receive_skb(skb);
3237 
3238 	/* if we've gotten here through NAPI, check netpoll */
3239 	if (netpoll_receive_skb(skb))
3240 		return NET_RX_DROP;
3241 
3242 	if (!skb->skb_iif)
3243 		skb->skb_iif = skb->dev->ifindex;
3244 	orig_dev = skb->dev;
3245 
3246 	skb_reset_network_header(skb);
3247 	skb_reset_transport_header(skb);
3248 	skb_reset_mac_len(skb);
3249 
3250 	pt_prev = NULL;
3251 
3252 	rcu_read_lock();
3253 
3254 another_round:
3255 
3256 	__this_cpu_inc(softnet_data.processed);
3257 
3258 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3259 		skb = vlan_untag(skb);
3260 		if (unlikely(!skb))
3261 			goto out;
3262 	}
3263 
3264 #ifdef CONFIG_NET_CLS_ACT
3265 	if (skb->tc_verd & TC_NCLS) {
3266 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3267 		goto ncls;
3268 	}
3269 #endif
3270 
3271 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3272 		if (!ptype->dev || ptype->dev == skb->dev) {
3273 			if (pt_prev)
3274 				ret = deliver_skb(skb, pt_prev, orig_dev);
3275 			pt_prev = ptype;
3276 		}
3277 	}
3278 
3279 #ifdef CONFIG_NET_CLS_ACT
3280 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3281 	if (!skb)
3282 		goto out;
3283 ncls:
3284 #endif
3285 
3286 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3287 	if (vlan_tx_tag_present(skb)) {
3288 		if (pt_prev) {
3289 			ret = deliver_skb(skb, pt_prev, orig_dev);
3290 			pt_prev = NULL;
3291 		}
3292 		if (vlan_do_receive(&skb, !rx_handler))
3293 			goto another_round;
3294 		else if (unlikely(!skb))
3295 			goto out;
3296 	}
3297 
3298 	if (rx_handler) {
3299 		if (pt_prev) {
3300 			ret = deliver_skb(skb, pt_prev, orig_dev);
3301 			pt_prev = NULL;
3302 		}
3303 		switch (rx_handler(&skb)) {
3304 		case RX_HANDLER_CONSUMED:
3305 			goto out;
3306 		case RX_HANDLER_ANOTHER:
3307 			goto another_round;
3308 		case RX_HANDLER_EXACT:
3309 			deliver_exact = true;
3310 		case RX_HANDLER_PASS:
3311 			break;
3312 		default:
3313 			BUG();
3314 		}
3315 	}
3316 
3317 	/* deliver only exact match when indicated */
3318 	null_or_dev = deliver_exact ? skb->dev : NULL;
3319 
3320 	type = skb->protocol;
3321 	list_for_each_entry_rcu(ptype,
3322 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3323 		if (ptype->type == type &&
3324 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3325 		     ptype->dev == orig_dev)) {
3326 			if (pt_prev)
3327 				ret = deliver_skb(skb, pt_prev, orig_dev);
3328 			pt_prev = ptype;
3329 		}
3330 	}
3331 
3332 	if (pt_prev) {
3333 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3334 	} else {
3335 		atomic_long_inc(&skb->dev->rx_dropped);
3336 		kfree_skb(skb);
3337 		/* Jamal, now you will not be able to escape explaining
3338 		 * to me how you were going to use this. :-)
3339 		 */
3340 		ret = NET_RX_DROP;
3341 	}
3342 
3343 out:
3344 	rcu_read_unlock();
3345 	return ret;
3346 }
3347 
3348 /**
3349  *	netif_receive_skb - process receive buffer from network
3350  *	@skb: buffer to process
3351  *
3352  *	netif_receive_skb() is the main receive data processing function.
3353  *	It always succeeds. The buffer may be dropped during processing
3354  *	for congestion control or by the protocol layers.
3355  *
3356  *	This function may only be called from softirq context and interrupts
3357  *	should be enabled.
3358  *
3359  *	Return values (usually ignored):
3360  *	NET_RX_SUCCESS: no congestion
3361  *	NET_RX_DROP: packet was dropped
3362  */
3363 int netif_receive_skb(struct sk_buff *skb)
3364 {
3365 	if (netdev_tstamp_prequeue)
3366 		net_timestamp_check(skb);
3367 
3368 	if (skb_defer_rx_timestamp(skb))
3369 		return NET_RX_SUCCESS;
3370 
3371 #ifdef CONFIG_RPS
3372 	{
3373 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3374 		int cpu, ret;
3375 
3376 		rcu_read_lock();
3377 
3378 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3379 
3380 		if (cpu >= 0) {
3381 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3382 			rcu_read_unlock();
3383 		} else {
3384 			rcu_read_unlock();
3385 			ret = __netif_receive_skb(skb);
3386 		}
3387 
3388 		return ret;
3389 	}
3390 #else
3391 	return __netif_receive_skb(skb);
3392 #endif
3393 }
3394 EXPORT_SYMBOL(netif_receive_skb);
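
/*
 * Usage sketch (illustrative only, not compiled here): a NAPI driver that
 * does not use GRO feeds received frames from its poll routine.  The
 * mydrv_* names are hypothetical.
 *
 *	while (work < budget && (skb = mydrv_rx_next(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, ring->netdev);
 *		netif_receive_skb(skb);
 *		work++;
 *	}
 */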
3395 
3396 /* Network device is going away, flush any packets still pending.
3397  * Called with irqs disabled.
3398  */
3399 static void flush_backlog(void *arg)
3400 {
3401 	struct net_device *dev = arg;
3402 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3403 	struct sk_buff *skb, *tmp;
3404 
3405 	rps_lock(sd);
3406 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3407 		if (skb->dev == dev) {
3408 			__skb_unlink(skb, &sd->input_pkt_queue);
3409 			kfree_skb(skb);
3410 			input_queue_head_incr(sd);
3411 		}
3412 	}
3413 	rps_unlock(sd);
3414 
3415 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3416 		if (skb->dev == dev) {
3417 			__skb_unlink(skb, &sd->process_queue);
3418 			kfree_skb(skb);
3419 			input_queue_head_incr(sd);
3420 		}
3421 	}
3422 }
3423 
3424 static int napi_gro_complete(struct sk_buff *skb)
3425 {
3426 	struct packet_type *ptype;
3427 	__be16 type = skb->protocol;
3428 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3429 	int err = -ENOENT;
3430 
3431 	if (NAPI_GRO_CB(skb)->count == 1) {
3432 		skb_shinfo(skb)->gso_size = 0;
3433 		goto out;
3434 	}
3435 
3436 	rcu_read_lock();
3437 	list_for_each_entry_rcu(ptype, head, list) {
3438 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3439 			continue;
3440 
3441 		err = ptype->gro_complete(skb);
3442 		break;
3443 	}
3444 	rcu_read_unlock();
3445 
3446 	if (err) {
3447 		WARN_ON(&ptype->list == head);
3448 		kfree_skb(skb);
3449 		return NET_RX_SUCCESS;
3450 	}
3451 
3452 out:
3453 	return netif_receive_skb(skb);
3454 }
3455 
3456 inline void napi_gro_flush(struct napi_struct *napi)
3457 {
3458 	struct sk_buff *skb, *next;
3459 
3460 	for (skb = napi->gro_list; skb; skb = next) {
3461 		next = skb->next;
3462 		skb->next = NULL;
3463 		napi_gro_complete(skb);
3464 	}
3465 
3466 	napi->gro_count = 0;
3467 	napi->gro_list = NULL;
3468 }
3469 EXPORT_SYMBOL(napi_gro_flush);
3470 
3471 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3472 {
3473 	struct sk_buff **pp = NULL;
3474 	struct packet_type *ptype;
3475 	__be16 type = skb->protocol;
3476 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3477 	int same_flow;
3478 	int mac_len;
3479 	enum gro_result ret;
3480 
3481 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3482 		goto normal;
3483 
3484 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3485 		goto normal;
3486 
3487 	rcu_read_lock();
3488 	list_for_each_entry_rcu(ptype, head, list) {
3489 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3490 			continue;
3491 
3492 		skb_set_network_header(skb, skb_gro_offset(skb));
3493 		mac_len = skb->network_header - skb->mac_header;
3494 		skb->mac_len = mac_len;
3495 		NAPI_GRO_CB(skb)->same_flow = 0;
3496 		NAPI_GRO_CB(skb)->flush = 0;
3497 		NAPI_GRO_CB(skb)->free = 0;
3498 
3499 		pp = ptype->gro_receive(&napi->gro_list, skb);
3500 		break;
3501 	}
3502 	rcu_read_unlock();
3503 
3504 	if (&ptype->list == head)
3505 		goto normal;
3506 
3507 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3508 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3509 
3510 	if (pp) {
3511 		struct sk_buff *nskb = *pp;
3512 
3513 		*pp = nskb->next;
3514 		nskb->next = NULL;
3515 		napi_gro_complete(nskb);
3516 		napi->gro_count--;
3517 	}
3518 
3519 	if (same_flow)
3520 		goto ok;
3521 
3522 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3523 		goto normal;
3524 
3525 	napi->gro_count++;
3526 	NAPI_GRO_CB(skb)->count = 1;
3527 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3528 	skb->next = napi->gro_list;
3529 	napi->gro_list = skb;
3530 	ret = GRO_HELD;
3531 
3532 pull:
3533 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3534 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3535 
3536 		BUG_ON(skb->end - skb->tail < grow);
3537 
3538 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3539 
3540 		skb->tail += grow;
3541 		skb->data_len -= grow;
3542 
3543 		skb_shinfo(skb)->frags[0].page_offset += grow;
3544 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3545 
3546 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3547 			skb_frag_unref(skb, 0);
3548 			memmove(skb_shinfo(skb)->frags,
3549 				skb_shinfo(skb)->frags + 1,
3550 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3551 		}
3552 	}
3553 
3554 ok:
3555 	return ret;
3556 
3557 normal:
3558 	ret = GRO_NORMAL;
3559 	goto pull;
3560 }
3561 EXPORT_SYMBOL(dev_gro_receive);
3562 
3563 static inline gro_result_t
3564 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3565 {
3566 	struct sk_buff *p;
3567 
3568 	for (p = napi->gro_list; p; p = p->next) {
3569 		unsigned long diffs;
3570 
3571 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3572 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3573 		diffs |= compare_ether_header(skb_mac_header(p),
3574 					      skb_gro_mac_header(skb));
3575 		NAPI_GRO_CB(p)->same_flow = !diffs;
3576 		NAPI_GRO_CB(p)->flush = 0;
3577 	}
3578 
3579 	return dev_gro_receive(napi, skb);
3580 }
3581 
3582 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3583 {
3584 	switch (ret) {
3585 	case GRO_NORMAL:
3586 		if (netif_receive_skb(skb))
3587 			ret = GRO_DROP;
3588 		break;
3589 
3590 	case GRO_DROP:
3591 	case GRO_MERGED_FREE:
3592 		kfree_skb(skb);
3593 		break;
3594 
3595 	case GRO_HELD:
3596 	case GRO_MERGED:
3597 		break;
3598 	}
3599 
3600 	return ret;
3601 }
3602 EXPORT_SYMBOL(napi_skb_finish);
3603 
3604 void skb_gro_reset_offset(struct sk_buff *skb)
3605 {
3606 	NAPI_GRO_CB(skb)->data_offset = 0;
3607 	NAPI_GRO_CB(skb)->frag0 = NULL;
3608 	NAPI_GRO_CB(skb)->frag0_len = 0;
3609 
3610 	if (skb->mac_header == skb->tail &&
3611 	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3612 		NAPI_GRO_CB(skb)->frag0 =
3613 			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3614 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3615 	}
3616 }
3617 EXPORT_SYMBOL(skb_gro_reset_offset);
3618 
3619 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3620 {
3621 	skb_gro_reset_offset(skb);
3622 
3623 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3624 }
3625 EXPORT_SYMBOL(napi_gro_receive);
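
/*
 * Usage sketch (illustrative only, not compiled here): GRO-aware drivers
 * pass frames from their poll routine through napi_gro_receive() instead
 * of calling netif_receive_skb() directly.
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(&priv->napi, skb);	(priv is hypothetical)
 */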
3626 
3627 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3628 {
3629 	__skb_pull(skb, skb_headlen(skb));
3630 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3631 	skb->vlan_tci = 0;
3632 	skb->dev = napi->dev;
3633 	skb->skb_iif = 0;
3634 
3635 	napi->skb = skb;
3636 }
3637 
3638 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3639 {
3640 	struct sk_buff *skb = napi->skb;
3641 
3642 	if (!skb) {
3643 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3644 		if (skb)
3645 			napi->skb = skb;
3646 	}
3647 	return skb;
3648 }
3649 EXPORT_SYMBOL(napi_get_frags);
3650 
3651 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3652 			       gro_result_t ret)
3653 {
3654 	switch (ret) {
3655 	case GRO_NORMAL:
3656 	case GRO_HELD:
3657 		skb->protocol = eth_type_trans(skb, skb->dev);
3658 
3659 		if (ret == GRO_HELD)
3660 			skb_gro_pull(skb, -ETH_HLEN);
3661 		else if (netif_receive_skb(skb))
3662 			ret = GRO_DROP;
3663 		break;
3664 
3665 	case GRO_DROP:
3666 	case GRO_MERGED_FREE:
3667 		napi_reuse_skb(napi, skb);
3668 		break;
3669 
3670 	case GRO_MERGED:
3671 		break;
3672 	}
3673 
3674 	return ret;
3675 }
3676 EXPORT_SYMBOL(napi_frags_finish);
3677 
3678 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3679 {
3680 	struct sk_buff *skb = napi->skb;
3681 	struct ethhdr *eth;
3682 	unsigned int hlen;
3683 	unsigned int off;
3684 
3685 	napi->skb = NULL;
3686 
3687 	skb_reset_mac_header(skb);
3688 	skb_gro_reset_offset(skb);
3689 
3690 	off = skb_gro_offset(skb);
3691 	hlen = off + sizeof(*eth);
3692 	eth = skb_gro_header_fast(skb, off);
3693 	if (skb_gro_header_hard(skb, hlen)) {
3694 		eth = skb_gro_header_slow(skb, hlen, off);
3695 		if (unlikely(!eth)) {
3696 			napi_reuse_skb(napi, skb);
3697 			skb = NULL;
3698 			goto out;
3699 		}
3700 	}
3701 
3702 	skb_gro_pull(skb, sizeof(*eth));
3703 
3704 	/*
3705 	 * This works because the only protocols we care about don't require
3706 	 * special handling.  We'll fix it up properly at the end.
3707 	 */
3708 	skb->protocol = eth->h_proto;
3709 
3710 out:
3711 	return skb;
3712 }
3713 EXPORT_SYMBOL(napi_frags_skb);
3714 
3715 gro_result_t napi_gro_frags(struct napi_struct *napi)
3716 {
3717 	struct sk_buff *skb = napi_frags_skb(napi);
3718 
3719 	if (!skb)
3720 		return GRO_DROP;
3721 
3722 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3723 }
3724 EXPORT_SYMBOL(napi_gro_frags);
3725 
3726 /*
3727  * net_rps_action sends any pending IPI's for rps.
3728  * Note: called with local irq disabled, but exits with local irq enabled.
3729  */
3730 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3731 {
3732 #ifdef CONFIG_RPS
3733 	struct softnet_data *remsd = sd->rps_ipi_list;
3734 
3735 	if (remsd) {
3736 		sd->rps_ipi_list = NULL;
3737 
3738 		local_irq_enable();
3739 
3740 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3741 		while (remsd) {
3742 			struct softnet_data *next = remsd->rps_ipi_next;
3743 
3744 			if (cpu_online(remsd->cpu))
3745 				__smp_call_function_single(remsd->cpu,
3746 							   &remsd->csd, 0);
3747 			remsd = next;
3748 		}
3749 	} else
3750 #endif
3751 		local_irq_enable();
3752 }
3753 
3754 static int process_backlog(struct napi_struct *napi, int quota)
3755 {
3756 	int work = 0;
3757 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3758 
3759 #ifdef CONFIG_RPS
3760 	/* Check if we have pending IPIs; it's better to send them now
3761 	 * rather than waiting for net_rx_action() to end.
3762 	 */
3763 	if (sd->rps_ipi_list) {
3764 		local_irq_disable();
3765 		net_rps_action_and_irq_enable(sd);
3766 	}
3767 #endif
3768 	napi->weight = weight_p;
3769 	local_irq_disable();
3770 	while (work < quota) {
3771 		struct sk_buff *skb;
3772 		unsigned int qlen;
3773 
3774 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3775 			local_irq_enable();
3776 			__netif_receive_skb(skb);
3777 			local_irq_disable();
3778 			input_queue_head_incr(sd);
3779 			if (++work >= quota) {
3780 				local_irq_enable();
3781 				return work;
3782 			}
3783 		}
3784 
3785 		rps_lock(sd);
3786 		qlen = skb_queue_len(&sd->input_pkt_queue);
3787 		if (qlen)
3788 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3789 						   &sd->process_queue);
3790 
3791 		if (qlen < quota - work) {
3792 			/*
3793 			 * Inline a custom version of __napi_complete().
3794 			 * Only the current CPU owns and manipulates this napi,
3795 			 * and NAPI_STATE_SCHED is the only possible flag set on the backlog,
3796 			 * so we can use a plain write instead of clear_bit(),
3797 			 * and we don't need an smp_mb() memory barrier.
3798 			 */
3799 			list_del(&napi->poll_list);
3800 			napi->state = 0;
3801 
3802 			quota = work + qlen;
3803 		}
3804 		rps_unlock(sd);
3805 	}
3806 	local_irq_enable();
3807 
3808 	return work;
3809 }
3810 
3811 /**
3812  * __napi_schedule - schedule for receive
3813  * @n: entry to schedule
3814  *
3815  * The entry's receive function will be scheduled to run
3816  */
3817 void __napi_schedule(struct napi_struct *n)
3818 {
3819 	unsigned long flags;
3820 
3821 	local_irq_save(flags);
3822 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3823 	local_irq_restore(flags);
3824 }
3825 EXPORT_SYMBOL(__napi_schedule);
3826 
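/*
 * __napi_complete - mark a NAPI context as done polling: remove @n from
 * the CPU's poll list and clear NAPI_STATE_SCHED.  Callers such as
 * napi_complete() invoke this with local interrupts disabled.
 */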
3827 void __napi_complete(struct napi_struct *n)
3828 {
3829 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3830 	BUG_ON(n->gro_list);
3831 
3832 	list_del(&n->poll_list);
3833 	smp_mb__before_clear_bit();
3834 	clear_bit(NAPI_STATE_SCHED, &n->state);
3835 }
3836 EXPORT_SYMBOL(__napi_complete);
3837 
3838 void napi_complete(struct napi_struct *n)
3839 {
3840 	unsigned long flags;
3841 
3842 	/*
3843 	 * Don't let napi dequeue from the CPU poll list
3844 	 * just in case it's running on a different CPU.
3845 	 */
3846 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3847 		return;
3848 
3849 	napi_gro_flush(n);
3850 	local_irq_save(flags);
3851 	__napi_complete(n);
3852 	local_irq_restore(flags);
3853 }
3854 EXPORT_SYMBOL(napi_complete);
3855 
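/*
 * netif_napi_add - initialize a NAPI context and link it into
 * @dev->napi_list.  Drivers typically call this during device setup;
 * note that the context is created with NAPI_STATE_SCHED already set.
 */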
3856 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3857 		    int (*poll)(struct napi_struct *, int), int weight)
3858 {
3859 	INIT_LIST_HEAD(&napi->poll_list);
3860 	napi->gro_count = 0;
3861 	napi->gro_list = NULL;
3862 	napi->skb = NULL;
3863 	napi->poll = poll;
3864 	napi->weight = weight;
3865 	list_add(&napi->dev_list, &dev->napi_list);
3866 	napi->dev = dev;
3867 #ifdef CONFIG_NETPOLL
3868 	spin_lock_init(&napi->poll_lock);
3869 	napi->poll_owner = -1;
3870 #endif
3871 	set_bit(NAPI_STATE_SCHED, &napi->state);
3872 }
3873 EXPORT_SYMBOL(netif_napi_add);
3874 
3875 void netif_napi_del(struct napi_struct *napi)
3876 {
3877 	struct sk_buff *skb, *next;
3878 
3879 	list_del_init(&napi->dev_list);
3880 	napi_free_frags(napi);
3881 
3882 	for (skb = napi->gro_list; skb; skb = next) {
3883 		next = skb->next;
3884 		skb->next = NULL;
3885 		kfree_skb(skb);
3886 	}
3887 
3888 	napi->gro_list = NULL;
3889 	napi->gro_count = 0;
3890 }
3891 EXPORT_SYMBOL(netif_napi_del);
3892 
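/*
 * net_rx_action - NET_RX_SOFTIRQ handler.  Polls each NAPI context on
 * this CPU's poll_list, bounded by netdev_budget and a 2-jiffy time
 * limit; if either is exceeded, time_squeeze is bumped and the softirq
 * is re-raised.
 */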
3893 static void net_rx_action(struct softirq_action *h)
3894 {
3895 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3896 	unsigned long time_limit = jiffies + 2;
3897 	int budget = netdev_budget;
3898 	void *have;
3899 
3900 	local_irq_disable();
3901 
3902 	while (!list_empty(&sd->poll_list)) {
3903 		struct napi_struct *n;
3904 		int work, weight;
3905 
3906 		/* If the softirq window is exhausted then punt.
3907 		 * Allow this to run for 2 jiffies, which allows
3908 		 * an average latency of 1.5/HZ.
3909 		 */
3910 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3911 			goto softnet_break;
3912 
3913 		local_irq_enable();
3914 
3915 		/* Even though interrupts have been re-enabled, this
3916 		 * access is safe because interrupts can only add new
3917 		 * entries to the tail of this list, and only ->poll()
3918 		 * calls can remove this head entry from the list.
3919 		 */
3920 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3921 
3922 		have = netpoll_poll_lock(n);
3923 
3924 		weight = n->weight;
3925 
3926 		/* This NAPI_STATE_SCHED test is for avoiding a race
3927 		 * with netpoll's poll_napi().  Only the entity which
3928 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3929 		 * actually make the ->poll() call.  Therefore we avoid
3930 		 * accidentally calling ->poll() when NAPI is not scheduled.
3931 		 */
3932 		work = 0;
3933 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3934 			work = n->poll(n, weight);
3935 			trace_napi_poll(n);
3936 		}
3937 
3938 		WARN_ON_ONCE(work > weight);
3939 
3940 		budget -= work;
3941 
3942 		local_irq_disable();
3943 
3944 		/* Drivers must not modify the NAPI state if they
3945 		 * consume the entire weight.  In such cases this code
3946 		 * still "owns" the NAPI instance and therefore can
3947 		 * move the instance around on the list at-will.
3948 		 */
3949 		if (unlikely(work == weight)) {
3950 			if (unlikely(napi_disable_pending(n))) {
3951 				local_irq_enable();
3952 				napi_complete(n);
3953 				local_irq_disable();
3954 			} else
3955 				list_move_tail(&n->poll_list, &sd->poll_list);
3956 		}
3957 
3958 		netpoll_poll_unlock(have);
3959 	}
3960 out:
3961 	net_rps_action_and_irq_enable(sd);
3962 
3963 #ifdef CONFIG_NET_DMA
3964 	/*
3965 	 * There may not be any more sk_buffs coming right now, so push
3966 	 * any pending DMA copies to hardware
3967 	 */
3968 	dma_issue_pending_all();
3969 #endif
3970 
3971 	return;
3972 
3973 softnet_break:
3974 	sd->time_squeeze++;
3975 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3976 	goto out;
3977 }
3978 
3979 static gifconf_func_t *gifconf_list[NPROTO];
3980 
3981 /**
3982  *	register_gifconf	-	register a SIOCGIF handler
3983  *	@family: Address family
3984  *	@gifconf: Function handler
3985  *
3986  *	Register protocol dependent address dumping routines. The handler
3987  *	that is passed must not be freed or reused until it has been replaced
3988  *	by another handler.
3989  */
3990 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3991 {
3992 	if (family >= NPROTO)
3993 		return -EINVAL;
3994 	gifconf_list[family] = gifconf;
3995 	return 0;
3996 }
3997 EXPORT_SYMBOL(register_gifconf);
3998 
3999 
4000 /*
4001  *	Map an interface index to its name (SIOCGIFNAME)
4002  */
4003 
4004 /*
4005  *	We need this ioctl for efficient implementation of the
4006  *	if_indextoname() function required by the IPv6 API.  Without
4007  *	it, we would have to search all the interfaces to find a
4008  *	match.  --pb
4009  */
4010 
4011 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4012 {
4013 	struct net_device *dev;
4014 	struct ifreq ifr;
4015 
4016 	/*
4017 	 *	Fetch the caller's info block.
4018 	 */
4019 
4020 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4021 		return -EFAULT;
4022 
4023 	rcu_read_lock();
4024 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4025 	if (!dev) {
4026 		rcu_read_unlock();
4027 		return -ENODEV;
4028 	}
4029 
4030 	strcpy(ifr.ifr_name, dev->name);
4031 	rcu_read_unlock();
4032 
4033 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4034 		return -EFAULT;
4035 	return 0;
4036 }
4037 
4038 /*
4039  *	Perform a SIOCGIFCONF call. This structure will change
4040  *	size eventually, and there is nothing I can do about it.
4041  *	Thus we will need a 'compatibility mode'.
4042  */
4043 
4044 static int dev_ifconf(struct net *net, char __user *arg)
4045 {
4046 	struct ifconf ifc;
4047 	struct net_device *dev;
4048 	char __user *pos;
4049 	int len;
4050 	int total;
4051 	int i;
4052 
4053 	/*
4054 	 *	Fetch the caller's info block.
4055 	 */
4056 
4057 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4058 		return -EFAULT;
4059 
4060 	pos = ifc.ifc_buf;
4061 	len = ifc.ifc_len;
4062 
4063 	/*
4064 	 *	Loop over the interfaces, and write an info block for each.
4065 	 */
4066 
4067 	total = 0;
4068 	for_each_netdev(net, dev) {
4069 		for (i = 0; i < NPROTO; i++) {
4070 			if (gifconf_list[i]) {
4071 				int done;
4072 				if (!pos)
4073 					done = gifconf_list[i](dev, NULL, 0);
4074 				else
4075 					done = gifconf_list[i](dev, pos + total,
4076 							       len - total);
4077 				if (done < 0)
4078 					return -EFAULT;
4079 				total += done;
4080 			}
4081 		}
4082 	}
4083 
4084 	/*
4085 	 *	All done.  Write the updated control block back to the caller.
4086 	 */
4087 	ifc.ifc_len = total;
4088 
4089 	/*
4090 	 * 	Both BSD and Solaris return 0 here, so we do too.
4091 	 */
4092 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4093 }
4094 
4095 #ifdef CONFIG_PROC_FS
4096 
4097 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4098 
4099 struct dev_iter_state {
4100 	struct seq_net_private p;
4101 	unsigned int pos; /* (bucket << BUCKET_SPACE) | offset */
4102 };
4103 
4104 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4105 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4106 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4107 
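/*
 * Return the net_device at the offset encoded in state->pos within the
 * current name-hash bucket, advancing state->pos past it.  Returns NULL
 * when the bucket is exhausted.
 */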
4108 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4109 {
4110 	struct dev_iter_state *state = seq->private;
4111 	struct net *net = seq_file_net(seq);
4112 	struct net_device *dev;
4113 	struct hlist_node *p;
4114 	struct hlist_head *h;
4115 	unsigned int count, bucket, offset;
4116 
4117 	bucket = get_bucket(state->pos);
4118 	offset = get_offset(state->pos);
4119 	h = &net->dev_name_head[bucket];
4120 	count = 0;
4121 	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4122 		if (count++ == offset) {
4123 			state->pos = set_bucket_offset(bucket, count);
4124 			return dev;
4125 		}
4126 	}
4127 
4128 	return NULL;
4129 }
4130 
4131 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4132 {
4133 	struct dev_iter_state *state = seq->private;
4134 	struct net_device *dev;
4135 	unsigned int bucket;
4136 
4137 	bucket = get_bucket(state->pos);
4138 	do {
4139 		dev = dev_from_same_bucket(seq);
4140 		if (dev)
4141 			return dev;
4142 
4143 		bucket++;
4144 		state->pos = set_bucket_offset(bucket, 0);
4145 	} while (bucket < NETDEV_HASHENTRIES);
4146 
4147 	return NULL;
4148 }
4149 
4150 /*
4151  *	This is invoked by the /proc filesystem handler to display a device
4152  *	in detail.
4153  */
4154 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4155 	__acquires(RCU)
4156 {
4157 	struct dev_iter_state *state = seq->private;
4158 
4159 	rcu_read_lock();
4160 	if (!*pos)
4161 		return SEQ_START_TOKEN;
4162 
4163 	/* check for end of the hash */
4164 	if (state->pos == 0 && *pos > 1)
4165 		return NULL;
4166 
4167 	return dev_from_new_bucket(seq);
4168 }
4169 
4170 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4171 {
4172 	struct net_device *dev;
4173 
4174 	++*pos;
4175 
4176 	if (v == SEQ_START_TOKEN)
4177 		return dev_from_new_bucket(seq);
4178 
4179 	dev = dev_from_same_bucket(seq);
4180 	if (dev)
4181 		return dev;
4182 
4183 	return dev_from_new_bucket(seq);
4184 }
4185 
4186 void dev_seq_stop(struct seq_file *seq, void *v)
4187 	__releases(RCU)
4188 {
4189 	rcu_read_unlock();
4190 }
4191 
4192 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4193 {
4194 	struct rtnl_link_stats64 temp;
4195 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4196 
4197 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4198 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4199 		   dev->name, stats->rx_bytes, stats->rx_packets,
4200 		   stats->rx_errors,
4201 		   stats->rx_dropped + stats->rx_missed_errors,
4202 		   stats->rx_fifo_errors,
4203 		   stats->rx_length_errors + stats->rx_over_errors +
4204 		    stats->rx_crc_errors + stats->rx_frame_errors,
4205 		   stats->rx_compressed, stats->multicast,
4206 		   stats->tx_bytes, stats->tx_packets,
4207 		   stats->tx_errors, stats->tx_dropped,
4208 		   stats->tx_fifo_errors, stats->collisions,
4209 		   stats->tx_carrier_errors +
4210 		    stats->tx_aborted_errors +
4211 		    stats->tx_window_errors +
4212 		    stats->tx_heartbeat_errors,
4213 		   stats->tx_compressed);
4214 }
4215 
4216 /*
4217  *	Called from the procfs module. This now uses the new arbitrary-sized
4218  *	/proc/net interface to create /proc/net/dev.
4219  */
4220 static int dev_seq_show(struct seq_file *seq, void *v)
4221 {
4222 	if (v == SEQ_START_TOKEN)
4223 		seq_puts(seq, "Inter-|   Receive                            "
4224 			      "                    |  Transmit\n"
4225 			      " face |bytes    packets errs drop fifo frame "
4226 			      "compressed multicast|bytes    packets errs "
4227 			      "drop fifo colls carrier compressed\n");
4228 	else
4229 		dev_seq_printf_stats(seq, v);
4230 	return 0;
4231 }
4232 
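/*
 * Return the softnet_data of the first online CPU at or after *pos,
 * advancing *pos past offline CPUs; NULL when no online CPU remains.
 */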
4233 static struct softnet_data *softnet_get_online(loff_t *pos)
4234 {
4235 	struct softnet_data *sd = NULL;
4236 
4237 	while (*pos < nr_cpu_ids)
4238 		if (cpu_online(*pos)) {
4239 			sd = &per_cpu(softnet_data, *pos);
4240 			break;
4241 		} else
4242 			++*pos;
4243 	return sd;
4244 }
4245 
4246 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4247 {
4248 	return softnet_get_online(pos);
4249 }
4250 
4251 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4252 {
4253 	++*pos;
4254 	return softnet_get_online(pos);
4255 }
4256 
4257 static void softnet_seq_stop(struct seq_file *seq, void *v)
4258 {
4259 }
4260 
4261 static int softnet_seq_show(struct seq_file *seq, void *v)
4262 {
4263 	struct softnet_data *sd = v;
4264 
4265 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4266 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4267 		   0, 0, 0, 0, /* was fastroute */
4268 		   sd->cpu_collision, sd->received_rps);
4269 	return 0;
4270 }
4271 
4272 static const struct seq_operations dev_seq_ops = {
4273 	.start = dev_seq_start,
4274 	.next  = dev_seq_next,
4275 	.stop  = dev_seq_stop,
4276 	.show  = dev_seq_show,
4277 };
4278 
4279 static int dev_seq_open(struct inode *inode, struct file *file)
4280 {
4281 	return seq_open_net(inode, file, &dev_seq_ops,
4282 			    sizeof(struct dev_iter_state));
4283 }
4284 
4285 static const struct file_operations dev_seq_fops = {
4286 	.owner	 = THIS_MODULE,
4287 	.open    = dev_seq_open,
4288 	.read    = seq_read,
4289 	.llseek  = seq_lseek,
4290 	.release = seq_release_net,
4291 };
4292 
4293 static const struct seq_operations softnet_seq_ops = {
4294 	.start = softnet_seq_start,
4295 	.next  = softnet_seq_next,
4296 	.stop  = softnet_seq_stop,
4297 	.show  = softnet_seq_show,
4298 };
4299 
4300 static int softnet_seq_open(struct inode *inode, struct file *file)
4301 {
4302 	return seq_open(file, &softnet_seq_ops);
4303 }
4304 
4305 static const struct file_operations softnet_seq_fops = {
4306 	.owner	 = THIS_MODULE,
4307 	.open    = softnet_seq_open,
4308 	.read    = seq_read,
4309 	.llseek  = seq_lseek,
4310 	.release = seq_release,
4311 };
4312 
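/*
 * Return the packet_type at logical index @pos, counting through
 * ptype_all first and then each ptype_base hash chain.
 */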
4313 static void *ptype_get_idx(loff_t pos)
4314 {
4315 	struct packet_type *pt = NULL;
4316 	loff_t i = 0;
4317 	int t;
4318 
4319 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4320 		if (i == pos)
4321 			return pt;
4322 		++i;
4323 	}
4324 
4325 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4326 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4327 			if (i == pos)
4328 				return pt;
4329 			++i;
4330 		}
4331 	}
4332 	return NULL;
4333 }
4334 
4335 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4336 	__acquires(RCU)
4337 {
4338 	rcu_read_lock();
4339 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4340 }
4341 
4342 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4343 {
4344 	struct packet_type *pt;
4345 	struct list_head *nxt;
4346 	int hash;
4347 
4348 	++*pos;
4349 	if (v == SEQ_START_TOKEN)
4350 		return ptype_get_idx(0);
4351 
4352 	pt = v;
4353 	nxt = pt->list.next;
4354 	if (pt->type == htons(ETH_P_ALL)) {
4355 		if (nxt != &ptype_all)
4356 			goto found;
4357 		hash = 0;
4358 		nxt = ptype_base[0].next;
4359 	} else
4360 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4361 
4362 	while (nxt == &ptype_base[hash]) {
4363 		if (++hash >= PTYPE_HASH_SIZE)
4364 			return NULL;
4365 		nxt = ptype_base[hash].next;
4366 	}
4367 found:
4368 	return list_entry(nxt, struct packet_type, list);
4369 }
4370 
4371 static void ptype_seq_stop(struct seq_file *seq, void *v)
4372 	__releases(RCU)
4373 {
4374 	rcu_read_unlock();
4375 }
4376 
4377 static int ptype_seq_show(struct seq_file *seq, void *v)
4378 {
4379 	struct packet_type *pt = v;
4380 
4381 	if (v == SEQ_START_TOKEN)
4382 		seq_puts(seq, "Type Device      Function\n");
4383 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4384 		if (pt->type == htons(ETH_P_ALL))
4385 			seq_puts(seq, "ALL ");
4386 		else
4387 			seq_printf(seq, "%04x", ntohs(pt->type));
4388 
4389 		seq_printf(seq, " %-8s %pF\n",
4390 			   pt->dev ? pt->dev->name : "", pt->func);
4391 	}
4392 
4393 	return 0;
4394 }
4395 
4396 static const struct seq_operations ptype_seq_ops = {
4397 	.start = ptype_seq_start,
4398 	.next  = ptype_seq_next,
4399 	.stop  = ptype_seq_stop,
4400 	.show  = ptype_seq_show,
4401 };
4402 
4403 static int ptype_seq_open(struct inode *inode, struct file *file)
4404 {
4405 	return seq_open_net(inode, file, &ptype_seq_ops,
4406 			sizeof(struct seq_net_private));
4407 }
4408 
4409 static const struct file_operations ptype_seq_fops = {
4410 	.owner	 = THIS_MODULE,
4411 	.open    = ptype_seq_open,
4412 	.read    = seq_read,
4413 	.llseek  = seq_lseek,
4414 	.release = seq_release_net,
4415 };
4416 
4417 
4418 static int __net_init dev_proc_net_init(struct net *net)
4419 {
4420 	int rc = -ENOMEM;
4421 
4422 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4423 		goto out;
4424 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4425 		goto out_dev;
4426 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4427 		goto out_softnet;
4428 
4429 	if (wext_proc_init(net))
4430 		goto out_ptype;
4431 	rc = 0;
4432 out:
4433 	return rc;
4434 out_ptype:
4435 	proc_net_remove(net, "ptype");
4436 out_softnet:
4437 	proc_net_remove(net, "softnet_stat");
4438 out_dev:
4439 	proc_net_remove(net, "dev");
4440 	goto out;
4441 }
4442 
4443 static void __net_exit dev_proc_net_exit(struct net *net)
4444 {
4445 	wext_proc_exit(net);
4446 
4447 	proc_net_remove(net, "ptype");
4448 	proc_net_remove(net, "softnet_stat");
4449 	proc_net_remove(net, "dev");
4450 }
4451 
4452 static struct pernet_operations __net_initdata dev_proc_ops = {
4453 	.init = dev_proc_net_init,
4454 	.exit = dev_proc_net_exit,
4455 };
4456 
4457 static int __init dev_proc_init(void)
4458 {
4459 	return register_pernet_subsys(&dev_proc_ops);
4460 }
4461 #else
4462 #define dev_proc_init() 0
4463 #endif	/* CONFIG_PROC_FS */
4464 
4465 
4466 /**
4467  *	netdev_set_master	-	set up master pointer
4468  *	@slave: slave device
4469  *	@master: new master device
4470  *
4471  *	Changes the master device of the slave. Pass %NULL to break the
4472  *	bonding. The caller must hold the RTNL semaphore. On a failure
4473  *	a negative errno code is returned. On success the reference counts
4474  *	are adjusted and the function returns zero.
4475  */
4476 int netdev_set_master(struct net_device *slave, struct net_device *master)
4477 {
4478 	struct net_device *old = slave->master;
4479 
4480 	ASSERT_RTNL();
4481 
4482 	if (master) {
4483 		if (old)
4484 			return -EBUSY;
4485 		dev_hold(master);
4486 	}
4487 
4488 	slave->master = master;
4489 
4490 	if (old)
4491 		dev_put(old);
4492 	return 0;
4493 }
4494 EXPORT_SYMBOL(netdev_set_master);
4495 
4496 /**
4497  *	netdev_set_bond_master	-	set up bonding master/slave pair
4498  *	@slave: slave device
4499  *	@master: new master device
4500  *
4501  *	Changes the master device of the slave. Pass %NULL to break the
4502  *	bonding. The caller must hold the RTNL semaphore. On a failure
4503  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4504  *	to the routing socket and the function returns zero.
4505  */
4506 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4507 {
4508 	int err;
4509 
4510 	ASSERT_RTNL();
4511 
4512 	err = netdev_set_master(slave, master);
4513 	if (err)
4514 		return err;
4515 	if (master)
4516 		slave->flags |= IFF_SLAVE;
4517 	else
4518 		slave->flags &= ~IFF_SLAVE;
4519 
4520 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4521 	return 0;
4522 }
4523 EXPORT_SYMBOL(netdev_set_bond_master);
4524 
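/*
 * Tell the driver, via ndo_change_rx_flags, which rx-mode flag changed.
 * The callback is only invoked while the device is up.
 */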
4525 static void dev_change_rx_flags(struct net_device *dev, int flags)
4526 {
4527 	const struct net_device_ops *ops = dev->netdev_ops;
4528 
4529 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4530 		ops->ndo_change_rx_flags(dev, flags);
4531 }
4532 
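/*
 * Adjust dev->promiscuity by @inc with overflow protection, update
 * IFF_PROMISC to match, log and audit any transition, and notify the
 * driver via dev_change_rx_flags().  Caller must hold RTNL.
 */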
4533 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4534 {
4535 	unsigned short old_flags = dev->flags;
4536 	uid_t uid;
4537 	gid_t gid;
4538 
4539 	ASSERT_RTNL();
4540 
4541 	dev->flags |= IFF_PROMISC;
4542 	dev->promiscuity += inc;
4543 	if (dev->promiscuity == 0) {
4544 		/*
4545 		 * Avoid overflow.
4546 		 * If inc causes overflow, untouch promisc and return error.
4547 		 */
4548 		if (inc < 0)
4549 			dev->flags &= ~IFF_PROMISC;
4550 		else {
4551 			dev->promiscuity -= inc;
4552 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4553 				"set promiscuity failed, promiscuity feature "
4554 				"of device might be broken.\n", dev->name);
4555 			return -EOVERFLOW;
4556 		}
4557 	}
4558 	if (dev->flags != old_flags) {
4559 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4560 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4561 							       "left");
4562 		if (audit_enabled) {
4563 			current_uid_gid(&uid, &gid);
4564 			audit_log(current->audit_context, GFP_ATOMIC,
4565 				AUDIT_ANOM_PROMISCUOUS,
4566 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4567 				dev->name, (dev->flags & IFF_PROMISC),
4568 				(old_flags & IFF_PROMISC),
4569 				audit_get_loginuid(current),
4570 				uid, gid,
4571 				audit_get_sessionid(current));
4572 		}
4573 
4574 		dev_change_rx_flags(dev, IFF_PROMISC);
4575 	}
4576 	return 0;
4577 }
4578 
4579 /**
4580  *	dev_set_promiscuity	- update promiscuity count on a device
4581  *	@dev: device
4582  *	@inc: modifier
4583  *
4584  *	Add or remove promiscuity from a device. While the count in the device
4585  *	remains above zero the interface remains promiscuous. Once it hits zero
4586  *	the device reverts back to normal filtering operation. A negative inc
4587  *	value is used to drop promiscuity on the device.
4588  *	Return 0 if successful or a negative errno code on error.
4589  */
4590 int dev_set_promiscuity(struct net_device *dev, int inc)
4591 {
4592 	unsigned short old_flags = dev->flags;
4593 	int err;
4594 
4595 	err = __dev_set_promiscuity(dev, inc);
4596 	if (err < 0)
4597 		return err;
4598 	if (dev->flags != old_flags)
4599 		dev_set_rx_mode(dev);
4600 	return err;
4601 }
4602 EXPORT_SYMBOL(dev_set_promiscuity);
4603 
4604 /**
4605  *	dev_set_allmulti	- update allmulti count on a device
4606  *	@dev: device
4607  *	@inc: modifier
4608  *
4609  *	Add or remove reception of all multicast frames on a device. While the
4610  *	count in the device remains above zero the interface keeps listening
4611  *	to all multicast frames. Once it hits zero the device reverts to normal
4612  *	filtering operation. A negative @inc value is used to drop the counter
4613  *	when releasing a resource needing all multicasts.
4614  *	Return 0 if successful or a negative errno code on error.
4615  */
4616 
4617 int dev_set_allmulti(struct net_device *dev, int inc)
4618 {
4619 	unsigned short old_flags = dev->flags;
4620 
4621 	ASSERT_RTNL();
4622 
4623 	dev->flags |= IFF_ALLMULTI;
4624 	dev->allmulti += inc;
4625 	if (dev->allmulti == 0) {
4626 		/*
4627 		 * Avoid overflow.
4628 		 * If inc causes overflow, untouch allmulti and return error.
4629 		 */
4630 		if (inc < 0)
4631 			dev->flags &= ~IFF_ALLMULTI;
4632 		else {
4633 			dev->allmulti -= inc;
4634 			printk(KERN_WARNING "%s: allmulti touches roof, "
4635 				"set allmulti failed, allmulti feature of "
4636 				"device might be broken.\n", dev->name);
4637 			return -EOVERFLOW;
4638 		}
4639 	}
4640 	if (dev->flags ^ old_flags) {
4641 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4642 		dev_set_rx_mode(dev);
4643 	}
4644 	return 0;
4645 }
4646 EXPORT_SYMBOL(dev_set_allmulti);
4647 
4648 /*
4649  *	Upload unicast and multicast address lists to device and
4650  *	configure RX filtering. When the device doesn't support unicast
4651  *	filtering it is put in promiscuous mode while unicast addresses
4652  *	are present.
4653  */
4654 void __dev_set_rx_mode(struct net_device *dev)
4655 {
4656 	const struct net_device_ops *ops = dev->netdev_ops;
4657 
4658 	/* dev_open will call this function so the list will stay sane. */
4659 	if (!(dev->flags&IFF_UP))
4660 		return;
4661 
4662 	if (!netif_device_present(dev))
4663 		return;
4664 
4665 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4666 		/* Unicast address changes may only happen under the rtnl,
4667 		 * therefore calling __dev_set_promiscuity here is safe.
4668 		 */
4669 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4670 			__dev_set_promiscuity(dev, 1);
4671 			dev->uc_promisc = true;
4672 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4673 			__dev_set_promiscuity(dev, -1);
4674 			dev->uc_promisc = false;
4675 		}
4676 	}
4677 
4678 	if (ops->ndo_set_rx_mode)
4679 		ops->ndo_set_rx_mode(dev);
4680 }
4681 
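/* Locked wrapper: takes netif_addr_lock_bh() around __dev_set_rx_mode(). */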
4682 void dev_set_rx_mode(struct net_device *dev)
4683 {
4684 	netif_addr_lock_bh(dev);
4685 	__dev_set_rx_mode(dev);
4686 	netif_addr_unlock_bh(dev);
4687 }
4688 
4689 /**
4690  *	dev_get_flags - get flags reported to userspace
4691  *	@dev: device
4692  *
4693  *	Get the combination of flag bits exported through APIs to userspace.
4694  */
4695 unsigned dev_get_flags(const struct net_device *dev)
4696 {
4697 	unsigned flags;
4698 
4699 	flags = (dev->flags & ~(IFF_PROMISC |
4700 				IFF_ALLMULTI |
4701 				IFF_RUNNING |
4702 				IFF_LOWER_UP |
4703 				IFF_DORMANT)) |
4704 		(dev->gflags & (IFF_PROMISC |
4705 				IFF_ALLMULTI));
4706 
4707 	if (netif_running(dev)) {
4708 		if (netif_oper_up(dev))
4709 			flags |= IFF_RUNNING;
4710 		if (netif_carrier_ok(dev))
4711 			flags |= IFF_LOWER_UP;
4712 		if (netif_dormant(dev))
4713 			flags |= IFF_DORMANT;
4714 	}
4715 
4716 	return flags;
4717 }
4718 EXPORT_SYMBOL(dev_get_flags);
4719 
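/*
 * Apply userspace-visible flag changes to @dev without sending
 * notifications: update dev->flags, bring the interface up or down when
 * IFF_UP toggles, and fold IFF_PROMISC/IFF_ALLMULTI requests into the
 * promiscuity/allmulti counters via dev->gflags.
 */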
4720 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4721 {
4722 	int old_flags = dev->flags;
4723 	int ret;
4724 
4725 	ASSERT_RTNL();
4726 
4727 	/*
4728 	 *	Set the flags on our device.
4729 	 */
4730 
4731 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4732 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4733 			       IFF_AUTOMEDIA)) |
4734 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4735 				    IFF_ALLMULTI));
4736 
4737 	/*
4738 	 *	Load in the correct multicast list now the flags have changed.
4739 	 */
4740 
4741 	if ((old_flags ^ flags) & IFF_MULTICAST)
4742 		dev_change_rx_flags(dev, IFF_MULTICAST);
4743 
4744 	dev_set_rx_mode(dev);
4745 
4746 	/*
4747 	 *	Have we downed the interface? We handle IFF_UP ourselves
4748 	 *	according to user attempts to set it, rather than blindly
4749 	 *	setting it.
4750 	 */
4751 
4752 	ret = 0;
4753 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4754 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4755 
4756 		if (!ret)
4757 			dev_set_rx_mode(dev);
4758 	}
4759 
4760 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4761 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4762 
4763 		dev->gflags ^= IFF_PROMISC;
4764 		dev_set_promiscuity(dev, inc);
4765 	}
4766 
4767 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4768 	   is important. Some (broken) drivers set IFF_PROMISC when
4769 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4770 	 */
4771 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4772 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4773 
4774 		dev->gflags ^= IFF_ALLMULTI;
4775 		dev_set_allmulti(dev, inc);
4776 	}
4777 
4778 	return ret;
4779 }
4780 
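/*
 * Emit the NETDEV_UP/NETDEV_DOWN and NETDEV_CHANGE notifier calls that
 * match the difference between dev->flags and @old_flags.
 */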
4781 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4782 {
4783 	unsigned int changes = dev->flags ^ old_flags;
4784 
4785 	if (changes & IFF_UP) {
4786 		if (dev->flags & IFF_UP)
4787 			call_netdevice_notifiers(NETDEV_UP, dev);
4788 		else
4789 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4790 	}
4791 
4792 	if (dev->flags & IFF_UP &&
4793 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4794 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4795 }
4796 
4797 /**
4798  *	dev_change_flags - change device settings
4799  *	@dev: device
4800  *	@flags: device state flags
4801  *
4802  *	Change settings on device based state flags. The flags are
4803  *	in the userspace exported format.
4804  */
4805 int dev_change_flags(struct net_device *dev, unsigned flags)
4806 {
4807 	int ret, changes;
4808 	int old_flags = dev->flags;
4809 
4810 	ret = __dev_change_flags(dev, flags);
4811 	if (ret < 0)
4812 		return ret;
4813 
4814 	changes = old_flags ^ dev->flags;
4815 	if (changes)
4816 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4817 
4818 	__dev_notify_flags(dev, old_flags);
4819 	return ret;
4820 }
4821 EXPORT_SYMBOL(dev_change_flags);
4822 
4823 /**
4824  *	dev_set_mtu - Change maximum transfer unit
4825  *	@dev: device
4826  *	@new_mtu: new transfer unit
4827  *
4828  *	Change the maximum transfer size of the network device.
4829  */
4830 int dev_set_mtu(struct net_device *dev, int new_mtu)
4831 {
4832 	const struct net_device_ops *ops = dev->netdev_ops;
4833 	int err;
4834 
4835 	if (new_mtu == dev->mtu)
4836 		return 0;
4837 
4838 	/*	MTU must be positive.	 */
4839 	if (new_mtu < 0)
4840 		return -EINVAL;
4841 
4842 	if (!netif_device_present(dev))
4843 		return -ENODEV;
4844 
4845 	err = 0;
4846 	if (ops->ndo_change_mtu)
4847 		err = ops->ndo_change_mtu(dev, new_mtu);
4848 	else
4849 		dev->mtu = new_mtu;
4850 
4851 	if (!err && dev->flags & IFF_UP)
4852 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4853 	return err;
4854 }
4855 EXPORT_SYMBOL(dev_set_mtu);
4856 
4857 /**
4858  *	dev_set_group - Change group this device belongs to
4859  *	@dev: device
4860  *	@new_group: group this device should belong to
4861  */
4862 void dev_set_group(struct net_device *dev, int new_group)
4863 {
4864 	dev->group = new_group;
4865 }
4866 EXPORT_SYMBOL(dev_set_group);
4867 
4868 /**
4869  *	dev_set_mac_address - Change Media Access Control Address
4870  *	@dev: device
4871  *	@sa: new address
4872  *
4873  *	Change the hardware (MAC) address of the device
4874  */
4875 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4876 {
4877 	const struct net_device_ops *ops = dev->netdev_ops;
4878 	int err;
4879 
4880 	if (!ops->ndo_set_mac_address)
4881 		return -EOPNOTSUPP;
4882 	if (sa->sa_family != dev->type)
4883 		return -EINVAL;
4884 	if (!netif_device_present(dev))
4885 		return -ENODEV;
4886 	err = ops->ndo_set_mac_address(dev, sa);
4887 	if (!err)
4888 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4889 	return err;
4890 }
4891 EXPORT_SYMBOL(dev_set_mac_address);
4892 
4893 /*
4894  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4895  */
4896 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4897 {
4898 	int err;
4899 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4900 
4901 	if (!dev)
4902 		return -ENODEV;
4903 
4904 	switch (cmd) {
4905 	case SIOCGIFFLAGS:	/* Get interface flags */
4906 		ifr->ifr_flags = (short) dev_get_flags(dev);
4907 		return 0;
4908 
4909 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4910 				   (currently unused) */
4911 		ifr->ifr_metric = 0;
4912 		return 0;
4913 
4914 	case SIOCGIFMTU:	/* Get the MTU of a device */
4915 		ifr->ifr_mtu = dev->mtu;
4916 		return 0;
4917 
4918 	case SIOCGIFHWADDR:
4919 		if (!dev->addr_len)
4920 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4921 		else
4922 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4923 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4924 		ifr->ifr_hwaddr.sa_family = dev->type;
4925 		return 0;
4926 
4927 	case SIOCGIFSLAVE:
4928 		err = -EINVAL;
4929 		break;
4930 
4931 	case SIOCGIFMAP:
4932 		ifr->ifr_map.mem_start = dev->mem_start;
4933 		ifr->ifr_map.mem_end   = dev->mem_end;
4934 		ifr->ifr_map.base_addr = dev->base_addr;
4935 		ifr->ifr_map.irq       = dev->irq;
4936 		ifr->ifr_map.dma       = dev->dma;
4937 		ifr->ifr_map.port      = dev->if_port;
4938 		return 0;
4939 
4940 	case SIOCGIFINDEX:
4941 		ifr->ifr_ifindex = dev->ifindex;
4942 		return 0;
4943 
4944 	case SIOCGIFTXQLEN:
4945 		ifr->ifr_qlen = dev->tx_queue_len;
4946 		return 0;
4947 
4948 	default:
4949 		/* dev_ioctl() should ensure this case
4950 		 * is never reached
4951 		 */
4952 		WARN_ON(1);
4953 		err = -ENOTTY;
4954 		break;
4955 
4956 	}
4957 	return err;
4958 }
4959 
4960 /*
4961  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4962  */
4963 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4964 {
4965 	int err;
4966 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4967 	const struct net_device_ops *ops;
4968 
4969 	if (!dev)
4970 		return -ENODEV;
4971 
4972 	ops = dev->netdev_ops;
4973 
4974 	switch (cmd) {
4975 	case SIOCSIFFLAGS:	/* Set interface flags */
4976 		return dev_change_flags(dev, ifr->ifr_flags);
4977 
4978 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4979 				   (currently unused) */
4980 		return -EOPNOTSUPP;
4981 
4982 	case SIOCSIFMTU:	/* Set the MTU of a device */
4983 		return dev_set_mtu(dev, ifr->ifr_mtu);
4984 
4985 	case SIOCSIFHWADDR:
4986 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4987 
4988 	case SIOCSIFHWBROADCAST:
4989 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4990 			return -EINVAL;
4991 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4992 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4993 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4994 		return 0;
4995 
4996 	case SIOCSIFMAP:
4997 		if (ops->ndo_set_config) {
4998 			if (!netif_device_present(dev))
4999 				return -ENODEV;
5000 			return ops->ndo_set_config(dev, &ifr->ifr_map);
5001 		}
5002 		return -EOPNOTSUPP;
5003 
5004 	case SIOCADDMULTI:
5005 		if (!ops->ndo_set_rx_mode ||
5006 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5007 			return -EINVAL;
5008 		if (!netif_device_present(dev))
5009 			return -ENODEV;
5010 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5011 
5012 	case SIOCDELMULTI:
5013 		if (!ops->ndo_set_rx_mode ||
5014 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5015 			return -EINVAL;
5016 		if (!netif_device_present(dev))
5017 			return -ENODEV;
5018 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5019 
5020 	case SIOCSIFTXQLEN:
5021 		if (ifr->ifr_qlen < 0)
5022 			return -EINVAL;
5023 		dev->tx_queue_len = ifr->ifr_qlen;
5024 		return 0;
5025 
5026 	case SIOCSIFNAME:
5027 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5028 		return dev_change_name(dev, ifr->ifr_newname);
5029 
5030 	case SIOCSHWTSTAMP:
5031 		err = net_hwtstamp_validate(ifr);
5032 		if (err)
5033 			return err;
5034 		/* fall through */
5035 
5036 	/*
5037 	 *	Unknown or private ioctl
5038 	 */
5039 	default:
5040 		if ((cmd >= SIOCDEVPRIVATE &&
5041 		    cmd <= SIOCDEVPRIVATE + 15) ||
5042 		    cmd == SIOCBONDENSLAVE ||
5043 		    cmd == SIOCBONDRELEASE ||
5044 		    cmd == SIOCBONDSETHWADDR ||
5045 		    cmd == SIOCBONDSLAVEINFOQUERY ||
5046 		    cmd == SIOCBONDINFOQUERY ||
5047 		    cmd == SIOCBONDCHANGEACTIVE ||
5048 		    cmd == SIOCGMIIPHY ||
5049 		    cmd == SIOCGMIIREG ||
5050 		    cmd == SIOCSMIIREG ||
5051 		    cmd == SIOCBRADDIF ||
5052 		    cmd == SIOCBRDELIF ||
5053 		    cmd == SIOCSHWTSTAMP ||
5054 		    cmd == SIOCWANDEV) {
5055 			err = -EOPNOTSUPP;
5056 			if (ops->ndo_do_ioctl) {
5057 				if (netif_device_present(dev))
5058 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
5059 				else
5060 					err = -ENODEV;
5061 			}
5062 		} else
5063 			err = -EINVAL;
5064 
5065 	}
5066 	return err;
5067 }
5068 
5069 /*
5070  *	This function handles all "interface"-type I/O control requests. The actual
5071  *	'doing' part of this is dev_ifsioc above.
5072  */
5073 
5074 /**
5075  *	dev_ioctl	-	network device ioctl
5076  *	@net: the applicable net namespace
5077  *	@cmd: command to issue
5078  *	@arg: pointer to a struct ifreq in user space
5079  *
5080  *	Issue ioctl functions to devices. This is normally called by the
5081  *	user space syscall interfaces but can sometimes be useful for
5082  *	other purposes. The return value is the return from the syscall if
5083  *	positive or a negative errno code on error.
5084  */
5085 
5086 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5087 {
5088 	struct ifreq ifr;
5089 	int ret;
5090 	char *colon;
5091 
5092 	/* One special case: SIOCGIFCONF takes an ifconf argument
5093 	   and requires a shared lock, because it sleeps writing
5094 	   to user space.
5095 	 */
5096 
5097 	if (cmd == SIOCGIFCONF) {
5098 		rtnl_lock();
5099 		ret = dev_ifconf(net, (char __user *) arg);
5100 		rtnl_unlock();
5101 		return ret;
5102 	}
5103 	if (cmd == SIOCGIFNAME)
5104 		return dev_ifname(net, (struct ifreq __user *)arg);
5105 
5106 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5107 		return -EFAULT;
5108 
5109 	ifr.ifr_name[IFNAMSIZ-1] = 0;
5110 
5111 	colon = strchr(ifr.ifr_name, ':');
5112 	if (colon)
5113 		*colon = 0;
5114 
5115 	/*
5116 	 *	See which interface the caller is talking about.
5117 	 */
5118 
5119 	switch (cmd) {
5120 	/*
5121 	 *	These ioctl calls:
5122 	 *	- can be done by all.
5123 	 *	- atomic and do not require locking.
5124 	 *	- return a value
5125 	 */
5126 	case SIOCGIFFLAGS:
5127 	case SIOCGIFMETRIC:
5128 	case SIOCGIFMTU:
5129 	case SIOCGIFHWADDR:
5130 	case SIOCGIFSLAVE:
5131 	case SIOCGIFMAP:
5132 	case SIOCGIFINDEX:
5133 	case SIOCGIFTXQLEN:
5134 		dev_load(net, ifr.ifr_name);
5135 		rcu_read_lock();
5136 		ret = dev_ifsioc_locked(net, &ifr, cmd);
5137 		rcu_read_unlock();
5138 		if (!ret) {
5139 			if (colon)
5140 				*colon = ':';
5141 			if (copy_to_user(arg, &ifr,
5142 					 sizeof(struct ifreq)))
5143 				ret = -EFAULT;
5144 		}
5145 		return ret;
5146 
5147 	case SIOCETHTOOL:
5148 		dev_load(net, ifr.ifr_name);
5149 		rtnl_lock();
5150 		ret = dev_ethtool(net, &ifr);
5151 		rtnl_unlock();
5152 		if (!ret) {
5153 			if (colon)
5154 				*colon = ':';
5155 			if (copy_to_user(arg, &ifr,
5156 					 sizeof(struct ifreq)))
5157 				ret = -EFAULT;
5158 		}
5159 		return ret;
5160 
5161 	/*
5162 	 *	These ioctl calls:
5163 	 *	- require superuser power.
5164 	 *	- require strict serialization.
5165 	 *	- return a value
5166 	 */
5167 	case SIOCGMIIPHY:
5168 	case SIOCGMIIREG:
5169 	case SIOCSIFNAME:
5170 		if (!capable(CAP_NET_ADMIN))
5171 			return -EPERM;
5172 		dev_load(net, ifr.ifr_name);
5173 		rtnl_lock();
5174 		ret = dev_ifsioc(net, &ifr, cmd);
5175 		rtnl_unlock();
5176 		if (!ret) {
5177 			if (colon)
5178 				*colon = ':';
5179 			if (copy_to_user(arg, &ifr,
5180 					 sizeof(struct ifreq)))
5181 				ret = -EFAULT;
5182 		}
5183 		return ret;
5184 
5185 	/*
5186 	 *	These ioctl calls:
5187 	 *	- require superuser power.
5188 	 *	- require strict serialization.
5189 	 *	- do not return a value
5190 	 */
5191 	case SIOCSIFFLAGS:
5192 	case SIOCSIFMETRIC:
5193 	case SIOCSIFMTU:
5194 	case SIOCSIFMAP:
5195 	case SIOCSIFHWADDR:
5196 	case SIOCSIFSLAVE:
5197 	case SIOCADDMULTI:
5198 	case SIOCDELMULTI:
5199 	case SIOCSIFHWBROADCAST:
5200 	case SIOCSIFTXQLEN:
5201 	case SIOCSMIIREG:
5202 	case SIOCBONDENSLAVE:
5203 	case SIOCBONDRELEASE:
5204 	case SIOCBONDSETHWADDR:
5205 	case SIOCBONDCHANGEACTIVE:
5206 	case SIOCBRADDIF:
5207 	case SIOCBRDELIF:
5208 	case SIOCSHWTSTAMP:
5209 		if (!capable(CAP_NET_ADMIN))
5210 			return -EPERM;
5211 		/* fall through */
5212 	case SIOCBONDSLAVEINFOQUERY:
5213 	case SIOCBONDINFOQUERY:
5214 		dev_load(net, ifr.ifr_name);
5215 		rtnl_lock();
5216 		ret = dev_ifsioc(net, &ifr, cmd);
5217 		rtnl_unlock();
5218 		return ret;
5219 
5220 	case SIOCGIFMEM:
5221 		/* Get the per device memory space. We can add this but
5222 		 * currently do not support it */
5223 	case SIOCSIFMEM:
5224 		/* Set the per device memory buffer space.
5225 		 * Not applicable in our case */
5226 	case SIOCSIFLINK:
5227 		return -ENOTTY;
5228 
5229 	/*
5230 	 *	Unknown or private ioctl.
5231 	 */
5232 	default:
5233 		if (cmd == SIOCWANDEV ||
5234 		    (cmd >= SIOCDEVPRIVATE &&
5235 		     cmd <= SIOCDEVPRIVATE + 15)) {
5236 			dev_load(net, ifr.ifr_name);
5237 			rtnl_lock();
5238 			ret = dev_ifsioc(net, &ifr, cmd);
5239 			rtnl_unlock();
5240 			if (!ret && copy_to_user(arg, &ifr,
5241 						 sizeof(struct ifreq)))
5242 				ret = -EFAULT;
5243 			return ret;
5244 		}
5245 		/* Take care of Wireless Extensions */
5246 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5247 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5248 		return -ENOTTY;
5249 	}
5250 }
5251 
5252 
5253 /**
5254  *	dev_new_index	-	allocate an ifindex
5255  *	@net: the applicable net namespace
5256  *
5257  *	Returns a suitable unique value for a new device interface
5258  *	number.  The caller must hold the rtnl semaphore or the
5259  *	dev_base_lock to be sure it remains unique.
5260  */
5261 static int dev_new_index(struct net *net)
5262 {
5263 	static int ifindex;
5264 	for (;;) {
5265 		if (++ifindex <= 0)
5266 			ifindex = 1;
5267 		if (!__dev_get_by_index(net, ifindex))
5268 			return ifindex;
5269 	}
5270 }
5271 
5272 /* Delayed registration/unregisteration */
5273 static LIST_HEAD(net_todo_list);
5274 
5275 static void net_set_todo(struct net_device *dev)
5276 {
5277 	list_add_tail(&dev->todo_list, &net_todo_list);
5278 }
5279 
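/*
 * Tear down a batch of devices being unregistered: close them, unlink
 * them from the device chain, shut down their qdiscs, notify protocols,
 * flush their address lists and remove their kobjects.  Caller must
 * hold RTNL.
 */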
5280 static void rollback_registered_many(struct list_head *head)
5281 {
5282 	struct net_device *dev, *tmp;
5283 
5284 	BUG_ON(dev_boot_phase);
5285 	ASSERT_RTNL();
5286 
5287 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5288 		/* Some devices call unregister without ever having been
5289 		 * registered, as part of initialization unwind. Remove those
5290 		 * devices and proceed with the remaining ones.
5291 		 */
5292 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5293 			pr_debug("unregister_netdevice: device %s/%p never "
5294 				 "was registered\n", dev->name, dev);
5295 
5296 			WARN_ON(1);
5297 			list_del(&dev->unreg_list);
5298 			continue;
5299 		}
5300 		dev->dismantle = true;
5301 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5302 	}
5303 
5304 	/* If device is running, close it first. */
5305 	dev_close_many(head);
5306 
5307 	list_for_each_entry(dev, head, unreg_list) {
5308 		/* And unlink it from device chain. */
5309 		unlist_netdevice(dev);
5310 
5311 		dev->reg_state = NETREG_UNREGISTERING;
5312 	}
5313 
5314 	synchronize_net();
5315 
5316 	list_for_each_entry(dev, head, unreg_list) {
5317 		/* Shutdown queueing discipline. */
5318 		dev_shutdown(dev);
5319 
5320 
5321 		/* Notify protocols that we are about to destroy
5322 		   this device. They should clean up all of their state.
5323 		*/
5324 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5325 
5326 		if (!dev->rtnl_link_ops ||
5327 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5328 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5329 
5330 		/*
5331 		 *	Flush the unicast and multicast chains
5332 		 */
5333 		dev_uc_flush(dev);
5334 		dev_mc_flush(dev);
5335 
5336 		if (dev->netdev_ops->ndo_uninit)
5337 			dev->netdev_ops->ndo_uninit(dev);
5338 
5339 		/* Notifier chain MUST detach us from master device. */
5340 		WARN_ON(dev->master);
5341 
5342 		/* Remove entries from kobject tree */
5343 		netdev_unregister_kobject(dev);
5344 	}
5345 
5346 	/* Process any work delayed until the end of the batch */
5347 	dev = list_first_entry(head, struct net_device, unreg_list);
5348 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5349 
5350 	synchronize_net();
5351 
5352 	list_for_each_entry(dev, head, unreg_list)
5353 		dev_put(dev);
5354 }
5355 
5356 static void rollback_registered(struct net_device *dev)
5357 {
5358 	LIST_HEAD(single);
5359 
5360 	list_add(&dev->unreg_list, &single);
5361 	rollback_registered_many(&single);
5362 	list_del(&single);
5363 }
5364 
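/*
 * Sanitize a feature mask: drop feature bits whose dependencies
 * (e.g. SG for TSO/GSO, checksum offload for SG and UFO) are not also
 * present, warning where the combination looks like a driver bug.
 */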
5365 static u32 netdev_fix_features(struct net_device *dev, u32 features)
5366 {
5367 	/* Fix illegal checksum combinations */
5368 	if ((features & NETIF_F_HW_CSUM) &&
5369 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5370 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5371 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5372 	}
5373 
5374 	if ((features & NETIF_F_NO_CSUM) &&
5375 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5376 		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5377 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5378 	}
5379 
5380 	/* Fix illegal SG+CSUM combinations. */
5381 	if ((features & NETIF_F_SG) &&
5382 	    !(features & NETIF_F_ALL_CSUM)) {
5383 		netdev_dbg(dev,
5384 			"Dropping NETIF_F_SG since no checksum feature.\n");
5385 		features &= ~NETIF_F_SG;
5386 	}
5387 
5388 	/* TSO requires that SG is present as well. */
5389 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5390 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5391 		features &= ~NETIF_F_ALL_TSO;
5392 	}
5393 
5394 	/* TSO ECN requires that TSO is present as well. */
5395 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5396 		features &= ~NETIF_F_TSO_ECN;
5397 
5398 	/* Software GSO depends on SG. */
5399 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5400 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5401 		features &= ~NETIF_F_GSO;
5402 	}
5403 
5404 	/* UFO needs SG and checksumming */
5405 	if (features & NETIF_F_UFO) {
5406 		/* maybe split UFO into V4 and V6? */
5407 		if (!((features & NETIF_F_GEN_CSUM) ||
5408 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5409 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5410 			netdev_dbg(dev,
5411 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5412 			features &= ~NETIF_F_UFO;
5413 		}
5414 
5415 		if (!(features & NETIF_F_SG)) {
5416 			netdev_dbg(dev,
5417 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5418 			features &= ~NETIF_F_UFO;
5419 		}
5420 	}
5421 
5422 	return features;
5423 }
5424 
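/*
 * Recompute dev->features from the wanted features, letting the driver
 * adjust them via ndo_fix_features and apply them via ndo_set_features.
 * Returns 0 if nothing changed, -1 if the driver rejected the new set,
 * and 1 otherwise.  Caller must hold RTNL.
 */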
5425 int __netdev_update_features(struct net_device *dev)
5426 {
5427 	u32 features;
5428 	int err = 0;
5429 
5430 	ASSERT_RTNL();
5431 
5432 	features = netdev_get_wanted_features(dev);
5433 
5434 	if (dev->netdev_ops->ndo_fix_features)
5435 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5436 
5437 	/* driver might be less strict about feature dependencies */
5438 	features = netdev_fix_features(dev, features);
5439 
5440 	if (dev->features == features)
5441 		return 0;
5442 
5443 	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5444 		dev->features, features);
5445 
5446 	if (dev->netdev_ops->ndo_set_features)
5447 		err = dev->netdev_ops->ndo_set_features(dev, features);
5448 
5449 	if (unlikely(err < 0)) {
5450 		netdev_err(dev,
5451 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5452 			err, features, dev->features);
5453 		return -1;
5454 	}
5455 
5456 	if (!err)
5457 		dev->features = features;
5458 
5459 	return 1;
5460 }
5461 
5462 /**
5463  *	netdev_update_features - recalculate device features
5464  *	@dev: the device to check
5465  *
5466  *	Recalculate dev->features set and send notifications if it
5467  *	has changed. Should be called after driver or hardware dependent
5468  *	conditions might have changed that influence the features.
5469  */
5470 void netdev_update_features(struct net_device *dev)
5471 {
5472 	if (__netdev_update_features(dev))
5473 		netdev_features_change(dev);
5474 }
5475 EXPORT_SYMBOL(netdev_update_features);
5476 
5477 /**
5478  *	netdev_change_features - recalculate device features
5479  *	@dev: the device to check
5480  *
5481  *	Recalculate dev->features set and send notifications even
5482  *	if they have not changed. Should be called instead of
5483  *	netdev_update_features() if also dev->vlan_features might
5484  *	have changed to allow the changes to be propagated to stacked
5485  *	VLAN devices.
5486  */
5487 void netdev_change_features(struct net_device *dev)
5488 {
5489 	__netdev_update_features(dev);
5490 	netdev_features_change(dev);
5491 }
5492 EXPORT_SYMBOL(netdev_change_features);
5493 
5494 /**
5495  *	netif_stacked_transfer_operstate -	transfer operstate
5496  *	@rootdev: the root or lower level device to transfer state from
5497  *	@dev: the device to transfer operstate to
5498  *
5499  *	Transfer operational state from root to device. This is normally
5500  *	called when a stacking relationship exists between the root
5501  *	device and the device (a leaf device).
5502  */
5503 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5504 					struct net_device *dev)
5505 {
5506 	if (rootdev->operstate == IF_OPER_DORMANT)
5507 		netif_dormant_on(dev);
5508 	else
5509 		netif_dormant_off(dev);
5510 
5511 	if (netif_carrier_ok(rootdev)) {
5512 		if (!netif_carrier_ok(dev))
5513 			netif_carrier_on(dev);
5514 	} else {
5515 		if (netif_carrier_ok(dev))
5516 			netif_carrier_off(dev);
5517 	}
5518 }
5519 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5520 
5521 #ifdef CONFIG_RPS
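/* Allocate dev->_rx, one netdev_rx_queue per RX queue, for RPS. */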
5522 static int netif_alloc_rx_queues(struct net_device *dev)
5523 {
5524 	unsigned int i, count = dev->num_rx_queues;
5525 	struct netdev_rx_queue *rx;
5526 
5527 	BUG_ON(count < 1);
5528 
5529 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5530 	if (!rx) {
5531 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5532 		return -ENOMEM;
5533 	}
5534 	dev->_rx = rx;
5535 
5536 	for (i = 0; i < count; i++)
5537 		rx[i].dev = dev;
5538 	return 0;
5539 }
5540 #endif
5541 
5542 static void netdev_init_one_queue(struct net_device *dev,
5543 				  struct netdev_queue *queue, void *_unused)
5544 {
5545 	/* Initialize queue lock */
5546 	spin_lock_init(&queue->_xmit_lock);
5547 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5548 	queue->xmit_lock_owner = -1;
5549 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5550 	queue->dev = dev;
5551 }
5552 
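/* Allocate dev->_tx and initialize one netdev_queue per TX queue. */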
5553 static int netif_alloc_netdev_queues(struct net_device *dev)
5554 {
5555 	unsigned int count = dev->num_tx_queues;
5556 	struct netdev_queue *tx;
5557 
5558 	BUG_ON(count < 1);
5559 
5560 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5561 	if (!tx) {
5562 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5563 		       count);
5564 		return -ENOMEM;
5565 	}
5566 	dev->_tx = tx;
5567 
5568 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5569 	spin_lock_init(&dev->tx_global_lock);
5570 
5571 	return 0;
5572 }
5573 
5574 /**
5575  *	register_netdevice	- register a network device
5576  *	@dev: device to register
5577  *
5578  *	Take a completed network device structure and add it to the kernel
5579  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5580  *	chain. 0 is returned on success. A negative errno code is returned
5581  *	on a failure to set up the device, or if the name is a duplicate.
5582  *
5583  *	Callers must hold the rtnl semaphore. You may want
5584  *	register_netdev() instead of this.
5585  *
5586  *	BUGS:
5587  *	The locking appears insufficient to guarantee two parallel registers
5588  *	will not get the same name.
5589  */
5590 
5591 int register_netdevice(struct net_device *dev)
5592 {
5593 	int ret;
5594 	struct net *net = dev_net(dev);
5595 
5596 	BUG_ON(dev_boot_phase);
5597 	ASSERT_RTNL();
5598 
5599 	might_sleep();
5600 
5601 	/* When net_device's are persistent, this will be fatal. */
5602 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5603 	BUG_ON(!net);
5604 
5605 	spin_lock_init(&dev->addr_list_lock);
5606 	netdev_set_addr_lockdep_class(dev);
5607 
5608 	dev->iflink = -1;
5609 
5610 	ret = dev_get_valid_name(dev, dev->name);
5611 	if (ret < 0)
5612 		goto out;
5613 
5614 	/* Init, if this function is available */
5615 	if (dev->netdev_ops->ndo_init) {
5616 		ret = dev->netdev_ops->ndo_init(dev);
5617 		if (ret) {
5618 			if (ret > 0)
5619 				ret = -EIO;
5620 			goto out;
5621 		}
5622 	}
5623 
5624 	dev->ifindex = dev_new_index(net);
5625 	if (dev->iflink == -1)
5626 		dev->iflink = dev->ifindex;
5627 
5628 	/* Transfer changeable features to wanted_features and enable
5629 	 * software offloads (GSO and GRO).
5630 	 */
5631 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5632 	dev->features |= NETIF_F_SOFT_FEATURES;
5633 	dev->wanted_features = dev->features & dev->hw_features;
5634 
5635 	/* Turn on no cache copy if HW is doing checksum */
5636 	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5637 	if ((dev->features & NETIF_F_ALL_CSUM) &&
5638 	    !(dev->features & NETIF_F_NO_CSUM)) {
5639 		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5640 		dev->features |= NETIF_F_NOCACHE_COPY;
5641 	}
5642 
5643 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5644 	 */
5645 	dev->vlan_features |= NETIF_F_HIGHDMA;
5646 
5647 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5648 	ret = notifier_to_errno(ret);
5649 	if (ret)
5650 		goto err_uninit;
5651 
5652 	ret = netdev_register_kobject(dev);
5653 	if (ret)
5654 		goto err_uninit;
5655 	dev->reg_state = NETREG_REGISTERED;
5656 
5657 	__netdev_update_features(dev);
5658 
5659 	/*
5660 	 *	Default initial state at registration is that the
5661 	 *	device is present.
5662 	 */
5663 
5664 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5665 
5666 	dev_init_scheduler(dev);
5667 	dev_hold(dev);
5668 	list_netdevice(dev);
5669 
5670 	/* Notify protocols that a new device appeared. */
5671 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5672 	ret = notifier_to_errno(ret);
5673 	if (ret) {
5674 		rollback_registered(dev);
5675 		dev->reg_state = NETREG_UNREGISTERED;
5676 	}
5677 	/*
5678 	 *	Prevent userspace races by waiting until the network
5679 	 *	device is fully setup before sending notifications.
5680 	 */
5681 	if (!dev->rtnl_link_ops ||
5682 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5683 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5684 
5685 out:
5686 	return ret;
5687 
5688 err_uninit:
5689 	if (dev->netdev_ops->ndo_uninit)
5690 		dev->netdev_ops->ndo_uninit(dev);
5691 	goto out;
5692 }
5693 EXPORT_SYMBOL(register_netdevice);
5694 
5695 /**
5696  *	init_dummy_netdev	- init a dummy network device for NAPI
5697  *	@dev: device to init
5698  *
5699  *	This takes a network device structure and initialize the minimum
5700  *	amount of fields so it can be used to schedule NAPI polls without
5701  *	registering a full blown interface. This is to be used by drivers
5702  *	that need to tie several hardware interfaces to a single NAPI
5703  *	poll scheduler due to HW limitations.
5704  */
5705 int init_dummy_netdev(struct net_device *dev)
5706 {
5707 	/* Clear everything. Note we don't initialize spinlocks
5708 	 * as they aren't supposed to be taken by any of the
5709 	 * NAPI code, and this dummy netdev is supposed to be
5710 	 * used only for NAPI polls.
5711 	 */
5712 	memset(dev, 0, sizeof(struct net_device));
5713 
5714 	/* make sure we BUG if trying to hit standard
5715 	 * register/unregister code path
5716 	 */
5717 	dev->reg_state = NETREG_DUMMY;
5718 
5719 	/* NAPI wants this */
5720 	INIT_LIST_HEAD(&dev->napi_list);
5721 
5722 	/* a dummy interface is started by default */
5723 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5724 	set_bit(__LINK_STATE_START, &dev->state);
5725 
5726 	/* Note: we don't allocate pcpu_refcnt for dummy devices,
5727 	 * because users of this 'device' don't need to change
5728 	 * its refcount.
5729 	 */
5730 
5731 	return 0;
5732 }
5733 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5734 
5735 
5736 /**
5737  *	register_netdev	- register a network device
5738  *	@dev: device to register
5739  *
5740  *	Take a completed network device structure and add it to the kernel
5741  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5742  *	chain. 0 is returned on success. A negative errno code is returned
5743  *	on a failure to set up the device, or if the name is a duplicate.
5744  *
5745  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5746  *	and expands the device name if you passed a format string to
5747  *	alloc_netdev.
5748  */
5749 int register_netdev(struct net_device *dev)
5750 {
5751 	int err;
5752 
5753 	rtnl_lock();
5754 	err = register_netdevice(dev);
5755 	rtnl_unlock();
5756 	return err;
5757 }
5758 EXPORT_SYMBOL(register_netdev);
5759 
5760 int netdev_refcnt_read(const struct net_device *dev)
5761 {
5762 	int i, refcnt = 0;
5763 
5764 	for_each_possible_cpu(i)
5765 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5766 	return refcnt;
5767 }
5768 EXPORT_SYMBOL(netdev_refcnt_read);
5769 
5770 /*
5771  * netdev_wait_allrefs - wait until all references are gone.
5772  *
5773  * This is called when unregistering network devices.
5774  *
5775  * Any protocol or device that holds a reference should register
5776  * for netdevice notification, and clean up and put back the
5777  * reference if they receive an UNREGISTER event.
5778  * We can get stuck here if buggy protocols don't correctly
5779  * call dev_put.
5780  */
5781 static void netdev_wait_allrefs(struct net_device *dev)
5782 {
5783 	unsigned long rebroadcast_time, warning_time;
5784 	int refcnt;
5785 
5786 	linkwatch_forget_dev(dev);
5787 
5788 	rebroadcast_time = warning_time = jiffies;
5789 	refcnt = netdev_refcnt_read(dev);
5790 
5791 	while (refcnt != 0) {
5792 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5793 			rtnl_lock();
5794 
5795 			/* Rebroadcast unregister notification */
5796 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5797 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5798 			 * should have already handled it the first time */
5799 
5800 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5801 				     &dev->state)) {
5802 				/* We must not have linkwatch events
5803 				 * pending on unregister. If this
5804 				 * happens, we simply run the queue
5805 				 * unscheduled, resulting in a noop
5806 				 * for this device.
5807 				 */
5808 				linkwatch_run_queue();
5809 			}
5810 
5811 			__rtnl_unlock();
5812 
5813 			rebroadcast_time = jiffies;
5814 		}
5815 
5816 		msleep(250);
5817 
5818 		refcnt = netdev_refcnt_read(dev);
5819 
5820 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5821 			printk(KERN_EMERG "unregister_netdevice: "
5822 			       "waiting for %s to become free. Usage "
5823 			       "count = %d\n",
5824 			       dev->name, refcnt);
5825 			warning_time = jiffies;
5826 		}
5827 	}
5828 }
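
/*
 * Example (illustrative sketch, not part of this interface): a subsystem
 * holding a long-lived dev_hold() reference should drop it from a netdevice
 * notifier so the loop above can terminate.  "my_cache_event" and
 * "my_cache_flush_dev" (which is assumed to dev_put() the device) are
 * hypothetical.
 *
 *	static int my_cache_event(struct notifier_block *nb,
 *				  unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UNREGISTER)
 *			my_cache_flush_dev(dev);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_cache_notifier = {
 *		.notifier_call = my_cache_event,
 *	};
 *
 * The block is hooked up at init time with
 * register_netdevice_notifier(&my_cache_notifier).
 */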
5829 
5830 /* The sequence is:
5831  *
5832  *	rtnl_lock();
5833  *	...
5834  *	register_netdevice(x1);
5835  *	register_netdevice(x2);
5836  *	...
5837  *	unregister_netdevice(y1);
5838  *	unregister_netdevice(y2);
5839  *      ...
5840  *	rtnl_unlock();
5841  *	free_netdev(y1);
5842  *	free_netdev(y2);
5843  *
5844  * We are invoked by rtnl_unlock().
5845  * This allows us to deal with problems:
5846  * 1) We can delete sysfs objects which invoke hotplug
5847  *    without deadlocking with linkwatch via keventd.
5848  * 2) Since we run with the RTNL semaphore not held, we can sleep
5849  *    safely in order to wait for the netdev refcnt to drop to zero.
5850  *
5851  * We must not return until all unregister events added during
5852  * the interval the lock was held have been completed.
5853  */
5854 void netdev_run_todo(void)
5855 {
5856 	struct list_head list;
5857 
5858 	/* Snapshot list, allow later requests */
5859 	list_replace_init(&net_todo_list, &list);
5860 
5861 	__rtnl_unlock();
5862 
5863 	/* Wait for rcu callbacks to finish before attempting to drain
5864 	 * the device list.  This usually avoids a 250ms wait.
5865 	 */
5866 	if (!list_empty(&list))
5867 		rcu_barrier();
5868 
5869 	while (!list_empty(&list)) {
5870 		struct net_device *dev
5871 			= list_first_entry(&list, struct net_device, todo_list);
5872 		list_del(&dev->todo_list);
5873 
5874 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5875 			printk(KERN_ERR "network todo '%s' but state %d\n",
5876 			       dev->name, dev->reg_state);
5877 			dump_stack();
5878 			continue;
5879 		}
5880 
5881 		dev->reg_state = NETREG_UNREGISTERED;
5882 
5883 		on_each_cpu(flush_backlog, dev, 1);
5884 
5885 		netdev_wait_allrefs(dev);
5886 
5887 		/* paranoia */
5888 		BUG_ON(netdev_refcnt_read(dev));
5889 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5890 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5891 		WARN_ON(dev->dn_ptr);
5892 
5893 		if (dev->destructor)
5894 			dev->destructor(dev);
5895 
5896 		/* Free network device */
5897 		kobject_put(&dev->dev.kobj);
5898 	}
5899 }
5900 
5901 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5902  * fields in the same order, with only the type differing.
5903  */
5904 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5905 				    const struct net_device_stats *netdev_stats)
5906 {
5907 #if BITS_PER_LONG == 64
5908 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5909 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5910 #else
5911 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5912 	const unsigned long *src = (const unsigned long *)netdev_stats;
5913 	u64 *dst = (u64 *)stats64;
5914 
5915 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5916 		     sizeof(*stats64) / sizeof(u64));
5917 	for (i = 0; i < n; i++)
5918 		dst[i] = src[i];
5919 #endif
5920 }
5921 
5922 /**
5923  *	dev_get_stats	- get network device statistics
5924  *	@dev: device to get statistics from
5925  *	@storage: place to store stats
5926  *
5927  *	Get network statistics from device. Return @storage.
5928  *	The device driver may provide its own method by setting
5929  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5930  *	otherwise the internal statistics structure is used.
5931  */
5932 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5933 					struct rtnl_link_stats64 *storage)
5934 {
5935 	const struct net_device_ops *ops = dev->netdev_ops;
5936 
5937 	if (ops->ndo_get_stats64) {
5938 		memset(storage, 0, sizeof(*storage));
5939 		ops->ndo_get_stats64(dev, storage);
5940 	} else if (ops->ndo_get_stats) {
5941 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5942 	} else {
5943 		netdev_stats_to_stats64(storage, &dev->stats);
5944 	}
5945 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5946 	return storage;
5947 }
5948 EXPORT_SYMBOL(dev_get_stats);
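
/*
 * Example (illustrative sketch): callers pass on-stack storage and use the
 * returned pointer, along the lines of
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	pr_info("%s: %llu packets received\n", dev->name,
 *		(unsigned long long)stats->rx_packets);
 */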
5949 
5950 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5951 {
5952 	struct netdev_queue *queue = dev_ingress_queue(dev);
5953 
5954 #ifdef CONFIG_NET_CLS_ACT
5955 	if (queue)
5956 		return queue;
5957 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5958 	if (!queue)
5959 		return NULL;
5960 	netdev_init_one_queue(dev, queue, NULL);
5961 	queue->qdisc = &noop_qdisc;
5962 	queue->qdisc_sleeping = &noop_qdisc;
5963 	rcu_assign_pointer(dev->ingress_queue, queue);
5964 #endif
5965 	return queue;
5966 }
5967 
5968 /**
5969  *	alloc_netdev_mqs - allocate network device
5970  *	@sizeof_priv:	size of private data to allocate space for
5971  *	@name:		device name format string
5972  *	@setup:		callback to initialize device
5973  *	@txqs:		the number of TX subqueues to allocate
5974  *	@rxqs:		the number of RX subqueues to allocate
5975  *
5976  *	Allocates a struct net_device with private data area for driver use
5977  *	and performs basic initialization.  Also allocates subqueue structs
5978  *	for each queue on the device.
5979  */
5980 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5981 		void (*setup)(struct net_device *),
5982 		unsigned int txqs, unsigned int rxqs)
5983 {
5984 	struct net_device *dev;
5985 	size_t alloc_size;
5986 	struct net_device *p;
5987 
5988 	BUG_ON(strlen(name) >= sizeof(dev->name));
5989 
5990 	if (txqs < 1) {
5991 		pr_err("alloc_netdev: Unable to allocate device "
5992 		       "with zero TX queues.\n");
5993 		return NULL;
5994 	}
5995 
5996 #ifdef CONFIG_RPS
5997 	if (rxqs < 1) {
5998 		pr_err("alloc_netdev: Unable to allocate device "
5999 		       "with zero RX queues.\n");
6000 		return NULL;
6001 	}
6002 #endif
6003 
6004 	alloc_size = sizeof(struct net_device);
6005 	if (sizeof_priv) {
6006 		/* ensure 32-byte alignment of private area */
6007 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6008 		alloc_size += sizeof_priv;
6009 	}
6010 	/* ensure 32-byte alignment of whole construct */
6011 	alloc_size += NETDEV_ALIGN - 1;
6012 
6013 	p = kzalloc(alloc_size, GFP_KERNEL);
6014 	if (!p) {
6015 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6016 		return NULL;
6017 	}
6018 
6019 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6020 	dev->padded = (char *)dev - (char *)p;
6021 
6022 	dev->pcpu_refcnt = alloc_percpu(int);
6023 	if (!dev->pcpu_refcnt)
6024 		goto free_p;
6025 
6026 	if (dev_addr_init(dev))
6027 		goto free_pcpu;
6028 
6029 	dev_mc_init(dev);
6030 	dev_uc_init(dev);
6031 
6032 	dev_net_set(dev, &init_net);
6033 
6034 	dev->gso_max_size = GSO_MAX_SIZE;
6035 
6036 	INIT_LIST_HEAD(&dev->napi_list);
6037 	INIT_LIST_HEAD(&dev->unreg_list);
6038 	INIT_LIST_HEAD(&dev->link_watch_list);
6039 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6040 	setup(dev);
6041 
6042 	dev->num_tx_queues = txqs;
6043 	dev->real_num_tx_queues = txqs;
6044 	if (netif_alloc_netdev_queues(dev))
6045 		goto free_all;
6046 
6047 #ifdef CONFIG_RPS
6048 	dev->num_rx_queues = rxqs;
6049 	dev->real_num_rx_queues = rxqs;
6050 	if (netif_alloc_rx_queues(dev))
6051 		goto free_all;
6052 #endif
6053 
6054 	strcpy(dev->name, name);
6055 	dev->group = INIT_NETDEV_GROUP;
6056 	return dev;
6057 
6058 free_all:
6059 	free_netdev(dev);
6060 	return NULL;
6061 
6062 free_pcpu:
6063 	free_percpu(dev->pcpu_refcnt);
6064 	kfree(dev->_tx);
6065 #ifdef CONFIG_RPS
6066 	kfree(dev->_rx);
6067 #endif
6068 
6069 free_p:
6070 	kfree(p);
6071 	return NULL;
6072 }
6073 EXPORT_SYMBOL(alloc_netdev_mqs);
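
/*
 * Example (illustrative sketch, not part of this interface): an Ethernet-like
 * device with eight TX and eight RX queues; the "%d" in the hypothetical
 * "myeth%d" name is expanded when the device is registered.
 *
 *	struct net_device *ndev;
 *
 *	ndev = alloc_netdev_mqs(sizeof(struct mydrv_priv), "myeth%d",
 *				ether_setup, 8, 8);
 *	if (!ndev)
 *		return -ENOMEM;
 */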
6074 
6075 /**
6076  *	free_netdev - free network device
6077  *	@dev: device
6078  *
6079  *	This function does the last stage of destroying an allocated device
6080  * 	interface. The reference to the device object is released.
6081  *	If this is the last reference then it will be freed.
6082  */
6083 void free_netdev(struct net_device *dev)
6084 {
6085 	struct napi_struct *p, *n;
6086 
6087 	release_net(dev_net(dev));
6088 
6089 	kfree(dev->_tx);
6090 #ifdef CONFIG_RPS
6091 	kfree(dev->_rx);
6092 #endif
6093 
6094 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6095 
6096 	/* Flush device addresses */
6097 	dev_addr_flush(dev);
6098 
6099 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6100 		netif_napi_del(p);
6101 
6102 	free_percpu(dev->pcpu_refcnt);
6103 	dev->pcpu_refcnt = NULL;
6104 
6105 	/*  Compatibility with error handling in drivers */
6106 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6107 		kfree((char *)dev - dev->padded);
6108 		return;
6109 	}
6110 
6111 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6112 	dev->reg_state = NETREG_RELEASED;
6113 
6114 	/* will free via device release */
6115 	put_device(&dev->dev);
6116 }
6117 EXPORT_SYMBOL(free_netdev);
6118 
6119 /**
6120  *	synchronize_net -  Synchronize with packet receive processing
6121  *
6122  *	Wait for packets currently being received to be done.
6123  *	Does not block later packets from starting.
6124  */
6125 void synchronize_net(void)
6126 {
6127 	might_sleep();
6128 	if (rtnl_is_locked())
6129 		synchronize_rcu_expedited();
6130 	else
6131 		synchronize_rcu();
6132 }
6133 EXPORT_SYMBOL(synchronize_net);
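
/*
 * Example (illustrative sketch): after unhooking a packet handler, wait for
 * in-flight receive processing before freeing the state it uses.  "my_ptype"
 * (a packet_type added earlier with dev_add_pack()) and "my_state" are
 * hypothetical.
 *
 *	__dev_remove_pack(&my_ptype);
 *	synchronize_net();
 *	kfree(my_state);
 */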
6134 
6135 /**
6136  *	unregister_netdevice_queue - remove device from the kernel
6137  *	@dev: device
6138  *	@head: list
6139  *
6140  *	This function shuts down a device interface and removes it
6141  *	from the kernel tables.
6142  *	If head is not NULL, the device is queued to be unregistered later.
6143  *
6144  *	Callers must hold the rtnl semaphore.  You may want
6145  *	unregister_netdev() instead of this.
6146  */
6147 
6148 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6149 {
6150 	ASSERT_RTNL();
6151 
6152 	if (head) {
6153 		list_move_tail(&dev->unreg_list, head);
6154 	} else {
6155 		rollback_registered(dev);
6156 		/* Finish processing unregister after unlock */
6157 		net_set_todo(dev);
6158 	}
6159 }
6160 EXPORT_SYMBOL(unregister_netdevice_queue);
6161 
6162 /**
6163  *	unregister_netdevice_many - unregister many devices
6164  *	@head: list of devices
6165  */
6166 void unregister_netdevice_many(struct list_head *head)
6167 {
6168 	struct net_device *dev;
6169 
6170 	if (!list_empty(head)) {
6171 		rollback_registered_many(head);
6172 		list_for_each_entry(dev, head, unreg_list)
6173 			net_set_todo(dev);
6174 	}
6175 }
6176 EXPORT_SYMBOL(unregister_netdevice_many);
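
/*
 * Example (illustrative sketch, not part of this interface): tearing down a
 * group of devices in one RTNL section lets rollback_registered_many() batch
 * the notifications and RCU grace periods.  "struct my_dev" and "my_dev_list"
 * are hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *	struct my_dev *d;
 *
 *	rtnl_lock();
 *	list_for_each_entry(d, &my_dev_list, list)
 *		unregister_netdevice_queue(d->ndev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */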
6177 
6178 /**
6179  *	unregister_netdev - remove device from the kernel
6180  *	@dev: device
6181  *
6182  *	This function shuts down a device interface and removes it
6183  *	from the kernel tables.
6184  *
6185  *	This is just a wrapper for unregister_netdevice that takes
6186  *	the rtnl semaphore.  In general you want to use this and not
6187  *	unregister_netdevice.
6188  */
6189 void unregister_netdev(struct net_device *dev)
6190 {
6191 	rtnl_lock();
6192 	unregister_netdevice(dev);
6193 	rtnl_unlock();
6194 }
6195 EXPORT_SYMBOL(unregister_netdev);
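
/*
 * Example (illustrative sketch, not part of this interface): the usual module
 * exit pairing; free_netdev() runs only after unregister_netdev() has
 * completed the todo processing above.  "mydrv_ndev" is a hypothetical
 * module-global net_device pointer.
 *
 *	static void __exit mydrv_exit(void)
 *	{
 *		unregister_netdev(mydrv_ndev);
 *		free_netdev(mydrv_ndev);
 *	}
 */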
6196 
6197 /**
6198  *	dev_change_net_namespace - move device to a different network namespace
6199  *	@dev: device
6200  *	@net: network namespace
6201  *	@pat: If not NULL, name pattern to try if the current device name
6202  *	      is already taken in the destination network namespace.
6203  *
6204  *	This function shuts down a device interface and moves it
6205  *	to a new network namespace. On success 0 is returned, on
6206  *	a failure a negative errno code is returned.
6207  *
6208  *	Callers must hold the rtnl semaphore.
6209  */
6210 
6211 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6212 {
6213 	int err;
6214 
6215 	ASSERT_RTNL();
6216 
6217 	/* Don't allow namespace local devices to be moved. */
6218 	err = -EINVAL;
6219 	if (dev->features & NETIF_F_NETNS_LOCAL)
6220 		goto out;
6221 
6222 	/* Ensure the device has been registered */
6223 	err = -EINVAL;
6224 	if (dev->reg_state != NETREG_REGISTERED)
6225 		goto out;
6226 
6227 	/* Get out if there is nothing to do */
6228 	err = 0;
6229 	if (net_eq(dev_net(dev), net))
6230 		goto out;
6231 
6232 	/* Pick the destination device name, and ensure
6233 	 * we can use it in the destination network namespace.
6234 	 */
6235 	err = -EEXIST;
6236 	if (__dev_get_by_name(net, dev->name)) {
6237 		/* We get here if we can't use the current device name */
6238 		if (!pat)
6239 			goto out;
6240 		if (dev_get_valid_name(dev, pat) < 0)
6241 			goto out;
6242 	}
6243 
6244 	/*
6245 	 * And now a mini version of register_netdevice and unregister_netdevice.
6246 	 */
6247 
6248 	/* If device is running close it first. */
6249 	dev_close(dev);
6250 
6251 	/* And unlink it from device chain */
6252 	err = -ENODEV;
6253 	unlist_netdevice(dev);
6254 
6255 	synchronize_net();
6256 
6257 	/* Shutdown queueing discipline. */
6258 	dev_shutdown(dev);
6259 
6260 	/* Notify protocols that we are about to destroy
6261 	   this device. They should clean up all of their state.
6262 
6263 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6264 	   This is wanted because this way 8021q and macvlan know
6265 	   the device is just moving and can keep their slaves up.
6266 	*/
6267 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6268 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6269 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6270 
6271 	/*
6272 	 *	Flush the unicast and multicast chains
6273 	 */
6274 	dev_uc_flush(dev);
6275 	dev_mc_flush(dev);
6276 
6277 	/* Actually switch the network namespace */
6278 	dev_net_set(dev, net);
6279 
6280 	/* If there is an ifindex conflict assign a new one */
6281 	if (__dev_get_by_index(net, dev->ifindex)) {
6282 		int iflink = (dev->iflink == dev->ifindex);
6283 		dev->ifindex = dev_new_index(net);
6284 		if (iflink)
6285 			dev->iflink = dev->ifindex;
6286 	}
6287 
6288 	/* Fixup kobjects */
6289 	err = device_rename(&dev->dev, dev->name);
6290 	WARN_ON(err);
6291 
6292 	/* Add the device back in the hashes */
6293 	list_netdevice(dev);
6294 
6295 	/* Notify protocols that a new device appeared. */
6296 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6297 
6298 	/*
6299 	 *	Prevent userspace races by waiting until the network
6300 	 *	device is fully setup before sending notifications.
6301 	 */
6302 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6303 
6304 	synchronize_net();
6305 	err = 0;
6306 out:
6307 	return err;
6308 }
6309 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
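
/*
 * Example (illustrative sketch, not part of this interface): moving a device
 * into another namespace under RTNL, falling back to a "dev%d" style name if
 * the current name is already taken there.  "target_net" is a hypothetical
 * struct net pointer the caller already holds.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "dev%d");
 *	rtnl_unlock();
 */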
6310 
6311 static int dev_cpu_callback(struct notifier_block *nfb,
6312 			    unsigned long action,
6313 			    void *ocpu)
6314 {
6315 	struct sk_buff **list_skb;
6316 	struct sk_buff *skb;
6317 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6318 	struct softnet_data *sd, *oldsd;
6319 
6320 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6321 		return NOTIFY_OK;
6322 
6323 	local_irq_disable();
6324 	cpu = smp_processor_id();
6325 	sd = &per_cpu(softnet_data, cpu);
6326 	oldsd = &per_cpu(softnet_data, oldcpu);
6327 
6328 	/* Find end of our completion_queue. */
6329 	list_skb = &sd->completion_queue;
6330 	while (*list_skb)
6331 		list_skb = &(*list_skb)->next;
6332 	/* Append completion queue from offline CPU. */
6333 	*list_skb = oldsd->completion_queue;
6334 	oldsd->completion_queue = NULL;
6335 
6336 	/* Append output queue from offline CPU. */
6337 	if (oldsd->output_queue) {
6338 		*sd->output_queue_tailp = oldsd->output_queue;
6339 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6340 		oldsd->output_queue = NULL;
6341 		oldsd->output_queue_tailp = &oldsd->output_queue;
6342 	}
6343 	/* Append NAPI poll list from offline CPU. */
6344 	if (!list_empty(&oldsd->poll_list)) {
6345 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6346 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6347 	}
6348 
6349 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6350 	local_irq_enable();
6351 
6352 	/* Process offline CPU's input_pkt_queue */
6353 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6354 		netif_rx(skb);
6355 		input_queue_head_incr(oldsd);
6356 	}
6357 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6358 		netif_rx(skb);
6359 		input_queue_head_incr(oldsd);
6360 	}
6361 
6362 	return NOTIFY_OK;
6363 }
6364 
6365 
6366 /**
6367  *	netdev_increment_features - increment feature set by one
6368  *	@all: current feature set
6369  *	@one: new feature set
6370  *	@mask: mask feature set
6371  *
6372  *	Computes a new feature set after adding a device with feature set
6373  *	@one to the master device with current feature set @all.  Will not
6374  *	enable anything that is off in @mask. Returns the new feature set.
6375  */
6376 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6377 {
6378 	if (mask & NETIF_F_GEN_CSUM)
6379 		mask |= NETIF_F_ALL_CSUM;
6380 	mask |= NETIF_F_VLAN_CHALLENGED;
6381 
6382 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6383 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6384 
6385 	/* If device needs checksumming, downgrade to it. */
6386 	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6387 		all &= ~NETIF_F_NO_CSUM;
6388 
6389 	/* If one device supports hw checksumming, set for all. */
6390 	if (all & NETIF_F_GEN_CSUM)
6391 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6392 
6393 	return all;
6394 }
6395 EXPORT_SYMBOL(netdev_increment_features);
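
/*
 * Example (illustrative sketch, not part of this interface): a master driver
 * (bonding/bridge style) folding each slave's feature set into a combined
 * value that it then installs on its own net_device.  "my_master",
 * "struct my_slave" and MY_MASTER_FEATURES are hypothetical.
 *
 *	u32 features = MY_MASTER_FEATURES;
 *	struct my_slave *s;
 *
 *	list_for_each_entry(s, &my_master->slaves, list)
 *		features = netdev_increment_features(features,
 *						     s->dev->features,
 *						     MY_MASTER_FEATURES);
 */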
6396 
6397 static struct hlist_head *netdev_create_hash(void)
6398 {
6399 	int i;
6400 	struct hlist_head *hash;
6401 
6402 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6403 	if (hash != NULL)
6404 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6405 			INIT_HLIST_HEAD(&hash[i]);
6406 
6407 	return hash;
6408 }
6409 
6410 /* Initialize per network namespace state */
6411 static int __net_init netdev_init(struct net *net)
6412 {
6413 	INIT_LIST_HEAD(&net->dev_base_head);
6414 
6415 	net->dev_name_head = netdev_create_hash();
6416 	if (net->dev_name_head == NULL)
6417 		goto err_name;
6418 
6419 	net->dev_index_head = netdev_create_hash();
6420 	if (net->dev_index_head == NULL)
6421 		goto err_idx;
6422 
6423 	return 0;
6424 
6425 err_idx:
6426 	kfree(net->dev_name_head);
6427 err_name:
6428 	return -ENOMEM;
6429 }
6430 
6431 /**
6432  *	netdev_drivername - network driver for the device
6433  *	@dev: network device
6434  *
6435  *	Determine the name of the network driver bound to the device.
6436  */
6437 const char *netdev_drivername(const struct net_device *dev)
6438 {
6439 	const struct device_driver *driver;
6440 	const struct device *parent;
6441 	const char *empty = "";
6442 
6443 	parent = dev->dev.parent;
6444 	if (!parent)
6445 		return empty;
6446 
6447 	driver = parent->driver;
6448 	if (driver && driver->name)
6449 		return driver->name;
6450 	return empty;
6451 }
6452 
6453 int __netdev_printk(const char *level, const struct net_device *dev,
6454 			   struct va_format *vaf)
6455 {
6456 	int r;
6457 
6458 	if (dev && dev->dev.parent)
6459 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6460 			       netdev_name(dev), vaf);
6461 	else if (dev)
6462 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6463 	else
6464 		r = printk("%s(NULL net_device): %pV", level, vaf);
6465 
6466 	return r;
6467 }
6468 EXPORT_SYMBOL(__netdev_printk);
6469 
6470 int netdev_printk(const char *level, const struct net_device *dev,
6471 		  const char *format, ...)
6472 {
6473 	struct va_format vaf;
6474 	va_list args;
6475 	int r;
6476 
6477 	va_start(args, format);
6478 
6479 	vaf.fmt = format;
6480 	vaf.va = &args;
6481 
6482 	r = __netdev_printk(level, dev, &vaf);
6483 	va_end(args);
6484 
6485 	return r;
6486 }
6487 EXPORT_SYMBOL(netdev_printk);
6488 
6489 #define define_netdev_printk_level(func, level)			\
6490 int func(const struct net_device *dev, const char *fmt, ...)	\
6491 {								\
6492 	int r;							\
6493 	struct va_format vaf;					\
6494 	va_list args;						\
6495 								\
6496 	va_start(args, fmt);					\
6497 								\
6498 	vaf.fmt = fmt;						\
6499 	vaf.va = &args;						\
6500 								\
6501 	r = __netdev_printk(level, dev, &vaf);			\
6502 	va_end(args);						\
6503 								\
6504 	return r;						\
6505 }								\
6506 EXPORT_SYMBOL(func);
6507 
6508 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6509 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6510 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6511 define_netdev_printk_level(netdev_err, KERN_ERR);
6512 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6513 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6514 define_netdev_printk_level(netdev_info, KERN_INFO);
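
/*
 * Example (illustrative sketch): drivers use these helpers instead of raw
 * printk() so that messages carry the driver and device name.  "txq" and
 * "speed" are hypothetical variables.
 *
 *	netdev_err(dev, "TX timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 */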
6515 
6516 static void __net_exit netdev_exit(struct net *net)
6517 {
6518 	kfree(net->dev_name_head);
6519 	kfree(net->dev_index_head);
6520 }
6521 
6522 static struct pernet_operations __net_initdata netdev_net_ops = {
6523 	.init = netdev_init,
6524 	.exit = netdev_exit,
6525 };
6526 
6527 static void __net_exit default_device_exit(struct net *net)
6528 {
6529 	struct net_device *dev, *aux;
6530 	/*
6531 	 * Push all migratable network devices back to the
6532 	 * initial network namespace
6533 	 */
6534 	rtnl_lock();
6535 	for_each_netdev_safe(net, dev, aux) {
6536 		int err;
6537 		char fb_name[IFNAMSIZ];
6538 
6539 		/* Ignore unmovable devices (e.g. loopback) */
6540 		if (dev->features & NETIF_F_NETNS_LOCAL)
6541 			continue;
6542 
6543 		/* Leave virtual devices for the generic cleanup */
6544 		if (dev->rtnl_link_ops)
6545 			continue;
6546 
6547 		/* Push remaining network devices to init_net */
6548 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6549 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6550 		if (err) {
6551 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6552 				__func__, dev->name, err);
6553 			BUG();
6554 		}
6555 	}
6556 	rtnl_unlock();
6557 }
6558 
6559 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6560 {
6561 	/* At exit all network devices must be removed from a network
6562 	 * namespace.  Do this in the reverse order of registration.
6563 	 * Do this across as many network namespaces as possible to
6564 	 * improve batching efficiency.
6565 	 */
6566 	struct net_device *dev;
6567 	struct net *net;
6568 	LIST_HEAD(dev_kill_list);
6569 
6570 	rtnl_lock();
6571 	list_for_each_entry(net, net_list, exit_list) {
6572 		for_each_netdev_reverse(net, dev) {
6573 			if (dev->rtnl_link_ops)
6574 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6575 			else
6576 				unregister_netdevice_queue(dev, &dev_kill_list);
6577 		}
6578 	}
6579 	unregister_netdevice_many(&dev_kill_list);
6580 	list_del(&dev_kill_list);
6581 	rtnl_unlock();
6582 }
6583 
6584 static struct pernet_operations __net_initdata default_device_ops = {
6585 	.exit = default_device_exit,
6586 	.exit_batch = default_device_exit_batch,
6587 };
6588 
6589 /*
6590  *	Initialize the DEV module. At boot time this walks the device list and
6591  *	unhooks any devices that fail to initialise (normally hardware not
6592  *	present) and leaves us with a valid list of present and active devices.
6593  *
6594  */
6595 
6596 /*
6597  *       This is called single threaded during boot, so no need
6598  *       to take the rtnl semaphore.
6599  */
6600 static int __init net_dev_init(void)
6601 {
6602 	int i, rc = -ENOMEM;
6603 
6604 	BUG_ON(!dev_boot_phase);
6605 
6606 	if (dev_proc_init())
6607 		goto out;
6608 
6609 	if (netdev_kobject_init())
6610 		goto out;
6611 
6612 	INIT_LIST_HEAD(&ptype_all);
6613 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6614 		INIT_LIST_HEAD(&ptype_base[i]);
6615 
6616 	if (register_pernet_subsys(&netdev_net_ops))
6617 		goto out;
6618 
6619 	/*
6620 	 *	Initialise the packet receive queues.
6621 	 */
6622 
6623 	for_each_possible_cpu(i) {
6624 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6625 
6626 		memset(sd, 0, sizeof(*sd));
6627 		skb_queue_head_init(&sd->input_pkt_queue);
6628 		skb_queue_head_init(&sd->process_queue);
6629 		sd->completion_queue = NULL;
6630 		INIT_LIST_HEAD(&sd->poll_list);
6631 		sd->output_queue = NULL;
6632 		sd->output_queue_tailp = &sd->output_queue;
6633 #ifdef CONFIG_RPS
6634 		sd->csd.func = rps_trigger_softirq;
6635 		sd->csd.info = sd;
6636 		sd->csd.flags = 0;
6637 		sd->cpu = i;
6638 #endif
6639 
6640 		sd->backlog.poll = process_backlog;
6641 		sd->backlog.weight = weight_p;
6642 		sd->backlog.gro_list = NULL;
6643 		sd->backlog.gro_count = 0;
6644 	}
6645 
6646 	dev_boot_phase = 0;
6647 
6648 	/* The loopback device is special: if any other network device
6649 	 * is present in a network namespace, the loopback device must
6650 	 * be present. Since we now dynamically allocate and free the
6651 	 * loopback device, ensure this invariant is maintained by
6652 	 * keeping the loopback device as the first device on the
6653 	 * list of network devices, so that the loopback device is the
6654 	 * first device that appears and the last network device
6655 	 * that disappears.
6656 	 */
6657 	if (register_pernet_device(&loopback_net_ops))
6658 		goto out;
6659 
6660 	if (register_pernet_device(&default_device_ops))
6661 		goto out;
6662 
6663 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6664 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6665 
6666 	hotcpu_notifier(dev_cpu_callback, 0);
6667 	dst_init();
6668 	dev_mcast_init();
6669 	rc = 0;
6670 out:
6671 	return rc;
6672 }
6673 
6674 subsys_initcall(net_dev_init);
6675 
6676 static int __init initialize_hashrnd(void)
6677 {
6678 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6679 	return 0;
6680 }
6681 
6682 late_initcall_sync(initialize_hashrnd);
6683 
6684